Data

Load NHANES datasets

Load the various environmental sample datasets, with demographics information included.

bake(file="rds/deet.rds",{
  # downloading all of the data with demographics information attached. 
  nhanes_load_data("OPD_D", "2005-2006" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat1
  nhanes_load_data("OPD_E", "2007-2008" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat2
  nhanes_load_data("OPD_G", "2011-2012" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat3

  # joining the data by lab results
  natural_join(dat1, dat2, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat3, by = "SEQN", jointype = "FULL") -> opd

  # Only 1 row per SEQN
  stopifnot(
    opd %>%
      group_by(SEQN) %>%
      summarise(total=n()) %>%
      filter(total>1) %>% nrow()==0
  )

  #################################################################

  nhanes_load_data("CARB_D", "2005-2006" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat4
  nhanes_load_data("CARB_E", "2007-2008" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat5
  
  natural_join(dat4, dat5, by = "SEQN", jointype = "FULL") %>%
    natural_join(opd, by = "SEQN", jointype = "FULL") -> carb

  # Only 1 row per SEQN
  stopifnot(
    carb %>%
      group_by(SEQN) %>%
      summarise(total=n()) %>%
      filter(total>1) %>% nrow()==0
  )

  #################################################################

  nhanes_load_data("LAB26PP", "1999-2000" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat6
  nhanes_load_data("L26PP_B", "2001-2002" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat7

  # by removing "file_name", we avoided 5,450 duplicates
  natural_join(dat6, dat7, by = "SEQN", jointype = "FULL") %>%
    natural_join(carb, by = "SEQN", jointype = "FULL") -> pp

  # Only 1 row per SEQN
  stopifnot(
    pp %>%
      group_by(SEQN) %>%
      summarise(total=n()) %>%
      filter(total>1) %>% nrow()==0
  )

  #################################################################

  nhanes_load_data("L26UPP_C", "2003-2004" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat8
  nhanes_load_data("UPP_D", "2005-2006" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat9
  nhanes_load_data("UPP_E", "2007-2008" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat10

  # by removing "file_name", we avoided 10,900 duplicates
  natural_join(dat8, dat9, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat10, by = "SEQN", jointype = "FULL") %>%
    natural_join(pp, by = "SEQN", jointype = "FULL") -> upp

  # Only 1 row per SEQN
  stopifnot(
    upp %>%
      group_by(SEQN) %>%
      summarise(total=n()) %>%
      filter(total>1) %>% nrow()==0
  )

  #################################################################

  nhanes_load_data("L24PP_C", "2003-2004" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat11
  nhanes_load_data("PP_D", "2005-2006" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat12
  nhanes_load_data("PP_E", "2007-2008" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat13
  nhanes_load_data("PP_F", "2009-2010" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat14
  nhanes_load_data("PP_G", "2011-2012" , demographics=TRUE) %>% #incomplete dataset
    dplyr::select(-file_name, -begin_year, -end_year) %>%
    mutate(WTSA2YR = coalesce(WTSA2YR, 0L)) -> dat15
  nhanes_load_data("EPHPP_H", "2013-2014" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat16

  natural_join(dat11, dat12, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat13, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat14, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat15, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat16, by = "SEQN", jointype = "FULL") %>%
    natural_join(upp, by = "SEQN", jointype = "FULL") -> L24

  # Only 1 row per SEQN
  stopifnot(
    L24 %>%
      group_by(SEQN) %>%
      summarise(total=n()) %>%
      filter(total>1) %>% nrow()==0
  )

  #################################################################

  nhanes_load_data("LAB28POC", "1999-2000" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat17
  nhanes_load_data("L28POC_B", "2001-2002" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat18
  nhanes_load_data("L28OCP_C", "2003-2004" , demographics=TRUE) %>% #incomplete dataset
    dplyr::select(-file_name, -begin_year, -end_year) -> dat19

  natural_join(dat17, dat18, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat19, by = "SEQN", jointype = "FULL") %>%
    natural_join(L24, by = "SEQN", jointype = "FULL") -> L28

  # Only 1 row per SEQN
  stopifnot(
    L28 %>%
      group_by(SEQN) %>%
      summarise(total=n()) %>%
      filter(total>1) %>% nrow()==0
  )

  # L28 %>%
  #     group_by(SEQN) %>%
  #     summarise(total=n()) %>%
  #     filter(total>1) %>% pull(SEQN) -> duplicate_SEQN
  # 
  # library(arsenal)
  # L28 %>% filter(SEQN==7) ->a
  # comparedf(a[1,],a[2,])

  #################################################################

  nhanes_load_data("UPHOPM_E", "2007-2008" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat20
  nhanes_load_data("UPHOPM_F", "2009-2010" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat21
  nhanes_load_data("UPHOPM_G", "2011-2012" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat22
  nhanes_load_data("UPHOPM_H", "2013-2014" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat23

  natural_join(dat20, dat21, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat22, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat23, by = "SEQN", jointype = "FULL") %>%
    natural_join(L28, by = "SEQN", jointype = "FULL") -> uphop

  # Only 1 row per SEQN
  stopifnot(
    uphop %>%
      group_by(SEQN) %>%
      summarise(total=n()) %>%
      filter(total>1) %>% nrow()==0
  )

  #################################################################

  nhanes_load_data("DEET_E", "2007-2008" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat24
  nhanes_load_data("DEET_F", "2009-2010" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat25
  nhanes_load_data("DEET_G", "2011-2012" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat26
  nhanes_load_data("DEET_H", "2013-2014" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> dat27

  nhanes_load_data("UAM_E", "2007-2008" , demographics=TRUE) %>%
    dplyr::select(-file_name, -begin_year, -end_year) -> uam

  natural_join(uphop,dat24, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat25, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat26, by = "SEQN", jointype = "FULL") %>%
    natural_join(dat27, by = "SEQN", jointype = "FULL") %>%
    natural_join(uam, by = "SEQN", jointype = "FULL") -> deet
}) -> deet

# Only 1 row per SEQN
stopifnot(
  deet %>%
    group_by(SEQN) %>%
    summarise(total=n()) %>%
    filter(total>1) %>% nrow()==0
)

# SEQNs for chemical data
deet %>% arrange(SEQN) %>% pull(SEQN) %>% unique() -> chem_assoc_SEQN

Load other demographic data.

bake(file="rds/occ.rds",{
# Inputting occupational data
# Vars: SEQN, OCD230, OCD231, OCD240, OCD241, OCD390, OCD391, OCD392
  remove_all_labels(nhanes('OCQ')) %>%
    mutate(cycle = "1999-2000", SDDSRVYR = 1) -> occ_2000
  remove_all_labels(nhanes('OCQ_B')) %>%
    mutate(cycle = "2001-2002", SDDSRVYR = 2) -> occ_2002
  remove_all_labels(nhanes('OCQ_C')) %>%
    mutate(cycle = "2003-2004", SDDSRVYR = 3) -> occ_2004
  remove_all_labels(nhanes('OCQ_D')) %>%
    mutate(cycle = "2005-2006", SDDSRVYR = 4) -> occ_2006
  remove_all_labels(nhanes('OCQ_E')) %>%
    mutate(cycle = "2007-2008", SDDSRVYR = 5) -> occ_2008
  remove_all_labels(nhanes('OCQ_F')) %>%
    mutate(cycle = "2009-2010", SDDSRVYR = 6) -> occ_2010
  remove_all_labels(nhanes('OCQ_G')) %>%
    mutate(cycle = "2011-2012", SDDSRVYR = 7) -> occ_2012
  remove_all_labels(nhanes('OCQ_H')) %>%
    mutate(cycle = "2013-2014", SDDSRVYR = 8) -> occ_2014
  # occ_2016 <- nhanes('OCQ_I')
  
  natural_join(occ_2000,occ_2002, by = "SEQN", jointype = "FULL") %>%
    natural_join(occ_2004, by = "SEQN", jointype = "FULL") %>%
    natural_join(occ_2006, by = "SEQN", jointype = "FULL") %>%
    natural_join(occ_2008, by = "SEQN", jointype = "FULL") %>%
    natural_join(occ_2010, by = "SEQN", jointype = "FULL") %>%
    natural_join(occ_2012, by = "SEQN", jointype = "FULL") %>%
    natural_join(occ_2014, by = "SEQN", jointype = "FULL") %>%
    dplyr::select(SEQN, cycle, SDDSRVYR, OCD230, OCD231, OCD240, OCD241, OCD390, OCD391, OCD392) %>%
    filter_at(vars(starts_with("OC")), any_vars(!is.na(.))) %>%
    mutate(farmwork = case_when(
      OCD230 %in% c(1,2) ~ 1,
      OCD231 ==   1      ~ 1,
      OCD390 %in% c(1,2) ~ 1,
      OCD391 ==   1      ~ 1,
      OCD392 ==   18     ~ 1,
      TRUE               ~ 0
    ),
    farmwork_sp = case_when(
      OCD230 %in% c(1,2) ~ 1,
      OCD231 ==   1      ~ 1,
      OCD240 %in% c(1,2) ~ 1,
      OCD241 ==   18     ~ 1,
      OCD390 %in% c(1,2) ~ 1,
      OCD391 ==   1      ~ 1,
      OCD392 ==   18     ~ 1,
      TRUE               ~ 0
    )) %>%
    filter(SEQN %in% chem_assoc_SEQN) -> occ
  }) -> occ

# Only 1 row per SEQN
stopifnot(
  occ %>%
    group_by(SEQN) %>%
    summarise(total=n()) %>%
    filter(total>1) %>% nrow()==0
)

# CBC
# cbc_2000 <- nhanes('LAB25')
# cbc_2002 <- nhanes('L25_B')
# cbc_2004 <- nhanes('L25_C')
# cbc_2006 <- nhanes('CBC_D')
# cbc_2008 <- nhanes('CBC_E')
# cbc_2010 <- nhanes('CBC_F')
# cbc_2012 <- nhanes('CBC_G')  #double check this dataset
# cbc_2014 <- nhanes('CBC_H')
# cbc_2016 <- nhanes('CBC_I')

bake(file="rds/bmi.rds",{
  # BMI dataset
  # Vars: SEQN, BMXBMI
  remove_all_labels(nhanes('BMX')) %>%
    mutate(cycle = "1999-2000", SDDSRVYR = 1) -> bmi2000
  remove_all_labels(nhanes('BMX_B')) %>%
    mutate(cycle = "2001-2002", SDDSRVYR = 2) -> bmi2002
  remove_all_labels(nhanes('BMX_C')) %>%
    mutate(cycle = "2003-2004", SDDSRVYR = 3) -> bmi2004
  remove_all_labels(nhanes('BMX_D')) %>%
    mutate(cycle = "2005-2006", SDDSRVYR = 4) -> bmi2006
  remove_all_labels(nhanes('BMX_E')) %>%
    mutate(cycle = "2007-2008", SDDSRVYR = 5) -> bmi2008
  remove_all_labels(nhanes('BMX_F')) %>%
    mutate(cycle = "2009-2010", SDDSRVYR = 6) -> bmi2010
  remove_all_labels(nhanes('BMX_G')) %>%
    mutate(cycle = "2011-2012", SDDSRVYR = 7) -> bmi2012
  remove_all_labels(nhanes('BMX_H')) %>%
    mutate(cycle = "2013-2014", SDDSRVYR = 8) -> bmi2014
  # bmi2016 <- nhanes('BMX_I')
  
  natural_join(bmi2000,bmi2002, by = "SEQN", jointype = "FULL") %>%
    natural_join(bmi2004, by = "SEQN", jointype = "FULL") %>%
    natural_join(bmi2006, by = "SEQN", jointype = "FULL") %>%
    natural_join(bmi2008, by = "SEQN", jointype = "FULL") %>%
    natural_join(bmi2010, by = "SEQN", jointype = "FULL") %>%
    natural_join(bmi2012, by = "SEQN", jointype = "FULL") %>%
    natural_join(bmi2014, by = "SEQN", jointype = "FULL") %>%
    dplyr::select(SEQN, cycle, BMXBMI) %>%
    filter(!is.na(BMXBMI)) %>%
    filter(SEQN %in% chem_assoc_SEQN) -> bmi
}) -> bmi

stopifnot(
  bmi %>%
    group_by(SEQN) %>%
    summarise(total=n()) %>%
    filter(total>1) %>% nrow()==0
)

Combine to final large dataset.

bake(file="rds/nhanes_all_dat.rds",{
  #merging datsets with the same variables
  natural_join(deet, occ, by = "SEQN", jointype = "FULL") %>%
    natural_join(bmi, by = "SEQN", jointype = "FULL") %>%
    mutate(edu = DMDEDUC2,
           edu = case_when(
      DMDEDUC3 %in% c(1:8,55,66) ~ 1,
      DMDEDUC3 %in% c(9:12)      ~ 2,
      DMDEDUC3 %in% c(13:14)     ~ 3,
      DMDEDUC3 ==   15           ~ 4,
      DMDEDUC3 %in% c(77,99)     ~ NA_real_,
      DMDEDUC2 %in% c(7,9)       ~ NA_real_,
      TRUE                       ~ as.numeric(DMDEDUC2)
     )) %>%
    mutate(
           #URXCPF   = URXCPM,
           #URXCPO   = URXCPM,
           URDCPFLC = URDCPMLC,
           URDCPOLC = URDCPMLC,
           LBXBHC   = coalesce(LBXBHC, LBDBHC),
           LBXBHCLA = coalesce(LBXBHCLA, LBDBHCLA)) %>%
    rename(LBX199   = LBD199,
           URXSIM   = URXSISM) %>%
    dplyr::select(-LBDBHC, -LBDBHCLA) -> all_dat
}) -> all_dat

# Only 1 row per SEQN
stopifnot(
  all_dat %>%
    group_by(SEQN) %>%
    summarise(total=n()) %>%
    filter(total>1) %>% nrow()==0
)

Create weighting variables

Create weighting variables to be used for the chemicals.

# raw, lipid adjusted, positive= LA, LC
all_dat %>%
  dplyr::select(!starts_with(c("WTI","WTMR","SIA",
                               "MIA","FIA","AIA","DMQ"))) %>%
  mutate(
    WTMEC16YR = case_when(
      SDDSRVYR %in% c(1,2)   ~ (2/8)*WTMEC4YR, # for 1999-2002
      SDDSRVYR %in% c(3:8)   ~ (1/8)*WTMEC2YR, # for 2003-2014
      TRUE                   ~ NA_real_),
    WT_LBXBHC = case_when(
      is.finite(WTSPO4YR) & SDDSRVYR %in% c(1,2)   ~ (2/3)*WTSPO4YR, # for 1999-2002, 12+ yrs 1/3 subsample
      is.finite(WTSB2YR)  & SDDSRVYR %in% c(3)     ~ (1/3)*WTSB2YR,  # for 2003-2004, 12+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_LBXDIE = case_when(
      is.finite(WTSPO2YR) & SDDSRVYR %in% c(2)     ~ (1/2)*WTSPO2YR, # for 2001-2002, 12+ yrs 1/3 subsample
      is.finite(WTSB2YR)  & SDDSRVYR %in% c(3)     ~ (1/2)*WTSB2YR,  # for 2003-2004, 12+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_LBXHPE = case_when(
      is.finite(WTSPO4YR) & SDDSRVYR %in% c(1,2)   ~ (2/3)*WTSPO4YR, # for 1999-2002, 12+ yrs 1/3 subsample
      is.finite(WTSB2YR)  & SDDSRVYR %in% c(3)     ~ (1/3)*WTSB2YR,  # for 2003-2004, 12+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_LBXPDE = case_when(
      is.finite(WTSPO4YR) & SDDSRVYR %in% c(1,2)   ~ (2/3)*WTSPO4YR, # for 1999-2002, 12+ yrs 1/3 subsample
      is.finite(WTSB2YR)  & SDDSRVYR %in% c(3)     ~ (1/3)*WTSB2YR,  # for 2003-2004, 12+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_LBXPDT = case_when(
      is.finite(WTSPO4YR) & SDDSRVYR %in% c(1,2)   ~ (2/3)*WTSPO4YR, # for 1999-2002, 12+ yrs 1/3 subsample
      is.finite(WTSB2YR)  & SDDSRVYR %in% c(3)     ~ (1/3)*WTSB2YR,  # for 2003-2004, 12+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_URX24D = case_when(
      is.finite(WTSPP4YR) & SDDSRVYR %in% c(1,2)   ~ (2/7)*WTSPP4YR, # for 1999-2002, 6-11 yrs 1/2 subsample, 12+ yrs 1/3 subsample
      is.finite(WTSC2YR)  & SDDSRVYR %in% c(3,5:8) ~ (1/7)*WTSC2YR,  # for 2003-2004, 2007-2014, 6+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_URXDEA = case_when(
      is.finite(WTSC2YR)  & SDDSRVYR %in% c(5:8)   ~ (1/4)*WTSC2YR,  # for 2007-2014, 6+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_URXOPM = case_when(
      is.finite(WTSPP4YR) & SDDSRVYR %in% c(1,2)   ~ (2/6)*WTSPP4YR, # for 1999-2002, 6-11 yrs 1/2 subsample, 12+ yrs 1/3 subsample
      is.finite(WTSC2YR)  & SDDSRVYR %in% c(5:8)   ~ (1/6)*WTSC2YR,  # for 2007-2014, 6+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_URXPAR = case_when(
      is.finite(WTSPP4YR) & SDDSRVYR %in% c(1,2)   ~ (2/6)*WTSPP4YR, # for 1999-2002, 6-11 yrs 1/2 subsample, 12+ yrs 1/3 subsample
      is.finite(WTSC2YR)  & SDDSRVYR %in% c(5:8)   ~ (1/6)*WTSC2YR,  # for 2007-2014, 6+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_URXCPM = case_when(
      is.finite(WTSPP4YR) & SDDSRVYR %in% c(1,2)   ~ (2/5)*WTSPP4YR, # for 1999-2002, 6-11 yrs 1/2 subsample, 12+ yrs 1/3 subsample
      is.finite(WTSC2YR)  & SDDSRVYR %in% c(5,6,8) ~ (1/5)*WTSC2YR,  # for 2007-2010, 2013-2014, 6+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_URX14D = case_when(
      is.finite(WTSC2YR)  & SDDSRVYR %in% c(3)     ~ (1/6)*WTSC2YR,  # for 2003-2004, 6+ yrs 1/3 subsample
      is.finite(WTSB2YR)  & SDDSRVYR %in% c(4:6,8) ~ (1/6)*WTSB2YR,  # for 2005-2010, 2013-2014, 6+ yrs 1/3 subsample
      is.finite(WTSA2YR)  & SDDSRVYR %in% c(7)     ~ (1/6)*WTSA2YR,  # for 2011-2012, 6+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      ),
    WT_URXDCB = case_when(
      is.finite(WTSC2YR)  & SDDSRVYR %in% c(3)     ~ (1/6)*WTSC2YR,  # for 2003-2004, 6+ yrs 1/3 subsample
      is.finite(WTSB2YR)  & SDDSRVYR %in% c(4:6,8) ~ (1/6)*WTSB2YR,  # for 2005-2010, 2013-2014, 6+ yrs 1/3 subsample
      is.finite(WTSA2YR)  & SDDSRVYR %in% c(7)     ~ (1/6)*WTSA2YR,  # for 2011-2012, 6+ yrs 1/3 subsample
      TRUE                                         ~ NA_real_
      )) -> all_dat2

data.frame(chemical = c("LBXBHC", "LBXDIE", "LBXHPE", "LBXPDE", "LBXPDT",
                        "URX24D", "URXDEA", "URXOPM", "URXPAR", "URXCPM",
                        "URX14D", "URXDCB"),
           weight =   c("WT_LBXBHC", "WT_LBXDIE", "WT_LBXHPE", "WT_LBXPDE", "WT_LBXPDT",
                        "WT_URX24D", "WT_URXDEA", "WT_URXOPM", "WT_URXPAR", "WT_URXCPM",
                        "WT_URX14D", "WT_URXDCB")) -> chem_weight_names

Format data

Extend chemical data into a long format.

Several chemicals do not have LC values:

bake(file="rds/chem_long.rds",{
  # LBs
  all_dat2 %>%
    dplyr::select(SEQN,starts_with("LB")) %>%
    dplyr::select(SEQN,ends_with("LC")) %>%
    gather(key=chemical,value=LC, -SEQN) %>%
    filter(!is.na(LC)) %>%
    mutate(chemical = stringr::str_replace(chemical, "^LBD", "LBX"),
           chemical = stringr::str_replace(chemical, "LC$", "")) -> dat_lblc

  all_dat2 %>%
    dplyr::select(SEQN,starts_with("LB")) %>%
    dplyr::select(SEQN,ends_with("LA")) %>%
    gather(key=chemical,value=LA, -SEQN) %>%
    filter(!is.na(LA)) %>%
    mutate(chemical = stringr::str_replace(chemical, "LA$", "")) -> dat_lbla

  all_dat2 %>%
    dplyr::select(SEQN,starts_with("LB")) %>%
    dplyr::select(!ends_with(c("LA","LC"))) %>%
    gather(key=chemical,value=measurement, -SEQN) %>%
    filter(!is.na(measurement)) %>%
    full_join(dat_lbla) %>%
    full_join(dat_lblc) -> dat_lb

  # URs
  all_dat2 %>%
    dplyr::select(SEQN,starts_with("UR")) %>%
    dplyr::select(-URXUCR) %>%
    dplyr::select(SEQN,ends_with("LC")) %>%
    gather(key=chemical,value=LC, -SEQN) %>%
    filter(!is.na(LC)) %>%
    mutate(chemical = stringr::str_replace(chemical, "^URD", "URX"),
           chemical = stringr::str_replace(chemical, "LC$", "")) -> dat_urlc

  all_dat2 %>%
    dplyr::select(SEQN,starts_with("UR")) %>%
    dplyr::select(!ends_with("LC")) %>%
    dplyr::select(-URXUCR) %>%
    gather(key=chemical,value=measurement, -SEQN) %>%
    filter(!is.na(measurement)) %>%
    mutate(LA = NA_real_) %>%
    full_join(dat_urlc) -> dat_ur

  # All chemicals
  rbind(dat_lb, dat_ur) -> chem_long
}) -> chem_long

# Do not have LC values
chem_long %>% filter(is.na(LC)) %>% pull(chemical) %>% unique()
## [1] "URXOP1" "URXOP2" "URXOP3" "URXOP4" "URXOP5" "URXOP6"

Demographic data.

# Everything else
all_dat2 %>%
  dplyr::select(SEQN, URXUCR) -> URXUCR

all_dat2 %>%
  dplyr::select(!starts_with("UR")) %>%
  dplyr::select(!starts_with("LB")) %>%
  dplyr::select(SEQN, SDDSRVYR, SDMVPSU, SDMVSTRA,
                farmwork, farmwork_sp, DMDCITZN, edu, 
                BMXBMI, RIDAGEYR, INDFMPIR, RIAGENDR,
                RIDRETH1, DMDBORN, WTMEC16YR, starts_with("WT_")) %>%
  left_join(URXUCR) %>%
  mutate(DMDCITZN    = ifelse(DMDCITZN %in% c(7,9), NA_real_, DMDCITZN),
         DMDCITZN    = ifelse(DMDCITZN ==1, 0, DMDCITZN), #citizen=0
         DMDCITZN    = ifelse(DMDCITZN ==2, 1, DMDCITZN), #non-citizen=1
         DMDCITZN    = as.factor(DMDCITZN),
         farmwork    = as.factor(farmwork),
         farmwork_sp = as.factor(farmwork_sp),
         SDDSRVYR    = as.factor(SDDSRVYR),
         RIAGENDR    = as.factor(RIAGENDR),
         edu         = as.factor(edu),
         RIDRETH1    = as.factor(RIDRETH1),
         farmwork_char2 = case_when(farmwork == 1 ~ "Farmworker",
                                   farmwork == 0 ~ "Non-Farmworker",
                                   TRUE          ~ NA_character_),
         farmwork_char = as.factor(farmwork_char2),
         citizen_char2  = case_when(DMDCITZN == 1 ~ "Non-Citizen",
                                   DMDCITZN == 0 ~ "US-Citizen",
                                   TRUE          ~ NA_character_),
         citizen_char  = as.factor(citizen_char2),
         gender_char   = as.factor(case_when(RIAGENDR == 1 ~ "Male",
                                   RIAGENDR == 2 ~ "Female",
                                   TRUE          ~ NA_character_)),
         race_char     = as.factor(case_when(RIDRETH1 == 1 ~ "Mexican American",
                                   RIDRETH1 == 2 ~ "Other Hispanic",
                                   RIDRETH1 == 3 ~ "Non-Hispanic White",
                                   RIDRETH1 == 4 ~ "Non-Hispanic Black",
                                   RIDRETH1 == 5 ~ "Other Race",
                                   TRUE          ~ NA_character_)),
         edu_char      = as.factor(case_when(edu      == 1 ~ "Less than 9th grade",
                                   edu      == 2 ~ "9-11th grade",
                                   edu      == 3 ~ "High school graduate/GED",
                                   edu      == 4 ~ "Some college or AA degree",
                                   edu      == 5 ~ "College graduate or above",
                                   TRUE          ~ NA_character_)),
         SDDSRVYR_char = as.factor(case_when(SDDSRVYR == 1 ~ "1999-2000",
                                   SDDSRVYR == 2 ~ "2001-2002",
                                   SDDSRVYR == 3 ~ "2003-2004",
                                   SDDSRVYR == 4 ~ "2005-2006",
                                   SDDSRVYR == 5 ~ "2007-2008",
                                   SDDSRVYR == 6 ~ "2009-2010",
                                   SDDSRVYR == 7 ~ "2011-2012",
                                   SDDSRVYR == 8 ~ "2013-2014"))
         ) -> demo_dat

factor(demo_dat$edu_char,
       levels = c("Less than 9th grade",
                  "9-11th grade",
                  "High school graduate/GED",
                  "Some college or AA degree",
                  "College graduate or above")) -> demo_dat$edu_char

factor(demo_dat$race_char,
       levels = c("Mexican American",
                  "Other Hispanic",
                  "Non-Hispanic White",
                  "Non-Hispanic Black",
                  "Other Race")) -> demo_dat$race_char

factor(demo_dat$gender_char,
       levels = c("Male",
                  "Female")) -> demo_dat$gender_char

Load Toxcats and molecular weights datasets

Load toxcast data.

read_delim("data/pestsheet_convert.csv", delim=",",
           col_names=TRUE,
           col_types = "cccnciciinncnnnncncccc")->mw

mw %>% pull(casn) %>% unique() -> chems_casn

# ACC thresholds, where hitc==1
readRDS("data/Pesticide_Toxcast_5-3-21.rds") %>%
  select(chnm, casn, modl_acc, hitc) %>%
  filter(!is.na(chnm), hitc==1) %>%
  distinct() %>%
  group_by(chnm, casn) %>%
  summarise(chem_min_log = min(modl_acc)) %>%
  ungroup() -> acc_thresh_chem

readRDS("data/Pesticide_Toxcast_5-3-21.rds") %>%
  select(chnm, casn, IntendedTargetFamily, modl_acc, hitc) %>%
  filter(!is.na(chnm), !is.na(IntendedTargetFamily), hitc==1) %>%
  distinct() %>%
  group_by(IntendedTargetFamily, casn) %>%
  summarise(target_min_log = min(modl_acc)) %>%
  ungroup() -> acc_thresh_target

readRDS("data/Pesticide_Toxcast_5-3-21.rds") %>%
  dplyr::select(aenm, casn, chnm, hitc, modl_acc, IntendedTargetFamily,
                IntendedTargetFamilySub, GeneName, GeneSymbol) %>%
  filter(casn %in% chems_casn) %>%
  left_join(acc_thresh_target) %>%
  left_join(acc_thresh_chem) %>%
  group_by(casn, IntendedTargetFamily) %>%
  filter(any(!is.na(modl_acc))) %>%
  mutate(target_max_log = max(modl_acc, na.rm = TRUE),
         target_max_log = ifelse(is.finite(target_max_log), target_max_log, NA_real_)) %>%
  ungroup() %>%
  group_by(casn) %>%
  mutate(chem_max_log = max(modl_acc, na.rm = TRUE),
         chem_max_log = ifelse(is.finite(chem_max_log), chem_max_log, NA_real_)) %>%
  ungroup() %>%
  full_join(mw %>% dplyr::select(chemical, casn, mw,
                            conversion, density=Density, LOD)) -> toxcast

readRDS("data/Pesticide_Toxcast_5-3-21.rds") %>%
  dplyr::select(aenm, casn, chnm, hitc, modl_acc, IntendedTargetFamily,
                IntendedTargetFamilySub, GeneName, GeneSymbol) -> toxcast_assays

Calculate molarity

Calculate molarity for the chemicals.

For blood measurements (LBX), lipid adjusted measurements (LBXLA) have units of \(\frac{\mathrm{ng}}{\mathrm{g}}\). We are given molecular weights (\(mw\)) as \(\frac{\mathrm{g}}{\mathrm{mol}}\) and a serum density of 1.024 \(\frac{\mathrm{g}}{\mathrm{mL}}\). And so our conversion will look like:

\[ \frac{\mathrm{LBXLA} \cancel{\mathrm{ng}}}{\cancel{\mathrm{g}}} \cdot \frac{1 \cancel{\mathrm{g}}}{10^9 \cancel{\mathrm{ng}}} \frac{1 \cancel{\mathrm{mol}}}{mw \cancel{\mathrm{g}}} \cdot \frac{1.024 \cancel{\mathrm{g}}}{1 \cancel{\mathrm{mL}}} \cdot \frac{10^3 \cancel{\mathrm{mL}}}{1 \mathrm{L}} \cdot \frac{10^6 \mathrm{\mu mol}}{1 \cancel{\mathrm{mol}}} = \frac{\mathrm{LBXLA} \cdot 1.024}{mw} \frac{\mathrm{\mu mol}}{\mathrm{L}} \]

Most urinary measurements (URX) have units of \(\frac{\mathrm{\mu g}}{\mathrm{L}}\). We are given molecular weights (\(mw\)) as \(\frac{\mathrm{g}}{\mathrm{mol}}\). And so our conversion will look like:

\[ \frac{\mathrm{URX} \cancel{\mathrm{\mu g}}}{\cancel{\mathrm{L}}} \cdot \frac{1 \cancel{\mathrm{g}}}{10^6 \cancel{\mathrm{\mu g}}} \frac{1 \cancel{\mathrm{mol}}}{mw \cancel{\mathrm{g}}} \cdot \frac{10^6 \mathrm{\mu mol}}{1 \cancel{\mathrm{mol}}} = \frac{\mathrm{URX}}{mw} \frac{\mathrm{\mu mol}}{\mathrm{L}} \] Creatinine (URXUCR) has units of \(\frac{\mathrm{mg}}{\mathrm{dL}}\). We are given a molecular weight of \(113.12 \frac{\mathrm{g}}{\mathrm{mol}}\). And so our conversion will look like:

\[ \frac{\mathrm{URXUCR} \cancel{\mathrm{mg}}}{\cancel{\mathrm{dL}}} \cdot \frac{10 \cancel{\mathrm{dL}}}{1 \mathrm{L}} \cdot \frac{1 \cancel{\mathrm{g}}}{10^3 \cancel{\mathrm{mg}}} \frac{1 \cancel{\mathrm{mol}}}{113.12 \cancel{\mathrm{g}}} \cdot \frac{10^6 \mathrm{\mu mol}}{1 \cancel{\mathrm{mol}}} = \frac{\mathrm{URXUCR} \cdot 10^4}{113.12} \frac{\mathrm{\mu mol}}{\mathrm{L}} \]

bake(file="rds/full_chem_demo_dat.rds",{
  chem_long %>%
    left_join(toxcast %>%
                dplyr::select(chemical, casn, chem_min_log, mw,
                              conversion, density) %>%
                arrange(chemical, chem_min_log) %>%
                distinct()) %>%
    mutate(molarity = 
             #ifelse(is.finite(LA), LA/mw, measurement/mw)
             case_when(
               grepl('^UR', chemical) ~ (measurement/mw), # umol/L
               grepl('^LB', chemical) ~ (LA/mw)*1.024, # umol/L
               TRUE ~ NA_real_
             )
    ) %>%
    mutate(indiv_chem_bioactiv = ifelse(log(molarity) >= chem_min_log, 1, 0)) %>%
    group_by(SEQN) %>%
    mutate(bioactiv_any = max(indiv_chem_bioactiv, na.rm = TRUE),
           bioactiv_any = ifelse(is.finite(bioactiv_any), bioactiv_any, NA_real_),
           bioactiv_count = sum(indiv_chem_bioactiv, na.rm = TRUE)) %>%
    ungroup()  %>%
    filter(!is.na(chem_min_log)) -> chem_long2
  
  # Creatinine molecular weight 113.12 mol/L, final molarity in umol/L
  left_join(chem_long2, demo_dat) %>%
    mutate(URXUCR_umol = (URXUCR/113.12)*10000,
           URXUCR_mol = URXUCR_umol/1000000,
           molarity_log_ratio = log(molarity/URXUCR_umol)) -> full_chem_demo_dat
}) -> full_chem_demo_dat

full_chem_demo_dat %>%
  select(SEQN, SDDSRVYR, chemical, molarity, molarity_log_ratio,
         indiv_chem_bioactiv, URXUCR_umol) -> chem_vars_for_analysis

Demographics

Stratified by farmwork category

min_age <- 18

demo_dat %>%
  mutate(nomiss.adj = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(BMXBMI) & !is.na(RIDAGEYR) & !is.na(SDDSRVYR) &
             !is.na(RIAGENDR) & !is.na(edu) & !is.na(RIDRETH1) &
             RIDAGEYR >= min_age & SEQN %in% chem_assoc_SEQN) -> temp_dat

svydesign(strata  = temp_dat$SDMVSTRA,  id   = temp_dat$SDMVPSU,
          weights = temp_dat$WTMEC16YR, data = temp_dat, nest=T) -> nhanes.svy

subset(nhanes.svy, nomiss.adj) -> svy.nomiss.adj

svy.nomiss.adj %>% 
  tbl_svysummary(
    # Use a character variable here. A factor leads to an error
    by = farmwork_char2,
    label = list(BMXBMI        ~ "Body Mass Index ",
                 RIDAGEYR      ~ "Age in Years",
                 SDDSRVYR_char ~ "Survey Year",
                 gender_char   ~ "Gender",
                 race_char     ~ "Racial Ethnicity",
                 citizen_char  ~ "U.S. Citizenship",
                 edu_char      ~ "Education Level"),
    # Use include to select variables
    include = c(BMXBMI, RIDAGEYR, SDDSRVYR_char,
                gender_char, race_char, citizen_char, edu_char),
    statistic = list(all_continuous()  ~ "{mean} ({sd})",
                     all_categorical() ~ "{n}    ({p}%)"),
    digits = list(BMXBMI ~ c(2, 2),
                  RIDAGEYR ~ c(2, 2),
                  all_categorical() ~ c(0, 1))
  ) %>%
  modify_header(label = "**Variable**") %>%
  modify_caption("Weighted descriptive statistics by farmwork category") %>%
  bold_labels()
Weighted descriptive statistics by farmwork category
Variable Farmworker, N = 3,812,6341 Non-Farmworker, N = 118,980,4181
Body Mass Index 28.64 (6.26) 28.46 (6.62)
Age in Years 49.19 (18.06) 45.67 (17.13)
Survey Year
    1999-2000 344,535 (9.0%) 7,367,362 (6.2%)
    2001-2002 449,822 (11.8%) 8,016,409 (6.7%)
    2003-2004 1,008,873 (26.5%) 15,884,012 (13.4%)
    2005-2006 282,165 (7.4%) 17,229,631 (14.5%)
    2007-2008 296,446 (7.8%) 17,498,191 (14.7%)
    2009-2010 603,995 (15.8%) 17,279,903 (14.5%)
    2011-2012 507,020 (13.3%) 17,228,566 (14.5%)
    2013-2014 319,778 (8.4%) 18,476,344 (15.5%)
Gender
    Male 2,360,387 (61.9%) 57,752,364 (48.5%)
    Female 1,452,247 (38.1%) 61,228,053 (51.5%)
Racial Ethnicity
    Mexican American 632,280 (16.6%) 8,797,498 (7.4%)
    Other Hispanic 89,501 (2.3%) 5,889,259 (4.9%)
    Non-Hispanic White 2,692,264 (70.6%) 82,759,734 (69.6%)
    Non-Hispanic Black 234,043 (6.1%) 13,686,588 (11.5%)
    Other Race 164,547 (4.3%) 7,847,338 (6.6%)
U.S. Citizenship
    Non-Citizen 598,909 (15.7%) 9,765,770 (8.2%)
    US-Citizen 3,213,725 (84.3%) 109,214,647 (91.8%)
Education Level
    Less than 9th grade 663,189 (17.4%) 6,057,572 (5.1%)
    9-11th grade 486,013 (12.7%) 14,604,709 (12.3%)
    High school graduate/GED 859,352 (22.5%) 28,462,959 (23.9%)
    Some college or AA degree 1,007,437 (26.4%) 37,246,172 (31.3%)
    College graduate or above 796,643 (20.9%) 32,609,006 (27.4%)
1 Mean (SD); n (%)
svyranktest(BMXBMI ~ farmwork, svy.nomiss.adj, test = c("wilcoxon"))
## 
##  Design-based KruskalWallis test
## 
## data:  BMXBMI ~ farmwork
## t = 1.0707, df = 122, p-value = 0.2864
## alternative hypothesis: true difference in mean rank score is not equal to 0
## sample estimates:
## difference in mean rank score 
##                    0.01481113
svyranktest(RIDAGEYR ~ farmwork, svy.nomiss.adj, test = c("wilcoxon"))
## 
##  Design-based KruskalWallis test
## 
## data:  RIDAGEYR ~ farmwork
## t = 3.8711, df = 122, p-value = 0.0001755
## alternative hypothesis: true difference in mean rank score is not equal to 0
## sample estimates:
## difference in mean rank score 
##                    0.05592827
svychisq( ~ farmwork + SDDSRVYR, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~farmwork + SDDSRVYR, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 204.94, df = 7, p-value = 7.299e-08
svychisq( ~ farmwork + RIAGENDR, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~farmwork + RIAGENDR, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 52.591, df = 1, p-value = 5.241e-06
svychisq( ~ farmwork + RIDRETH1, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~farmwork + RIDRETH1, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 115.71, df = 4, p-value < 2.2e-16
svychisq( ~ farmwork + DMDCITZN, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~farmwork + DMDCITZN, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 53.518, df = 1, p-value = 1.124e-09
svychisq( ~ farmwork + edu, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~farmwork + edu, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 221.12, df = 4, p-value < 2.2e-16

Stratified by citizenship with history of farmwork

demo_dat %>%
  mutate(nomiss.adj = farmwork==1 & !is.na(DMDCITZN) &
             !is.na(BMXBMI) & !is.na(RIDAGEYR) & !is.na(SDDSRVYR) &
             !is.na(RIAGENDR) & !is.na(edu) & !is.na(RIDRETH1) &
             RIDAGEYR >= min_age & SEQN %in% chem_assoc_SEQN) -> temp_dat

svydesign(strata  = temp_dat$SDMVSTRA,  id   = temp_dat$SDMVPSU,
          weights = temp_dat$WTMEC16YR, data = temp_dat, nest=T) -> nhanes.svy

subset(nhanes.svy, nomiss.adj) -> svy.nomiss.adj

svy.nomiss.adj %>% 
  tbl_svysummary(
    # Use a character variable here. A factor leads to an error
    by = citizen_char2,
    label = list(BMXBMI        ~ "Body Mass Index ",
                 RIDAGEYR      ~ "Age in Years",
                 SDDSRVYR_char ~ "Survey Year",
                 gender_char   ~ "Gender",
                 race_char     ~ "Racial Ethnicity",
                 edu_char      ~ "Education Level"),
    # Use include to select variables
    include = c(BMXBMI, RIDAGEYR, SDDSRVYR_char,
                gender_char, race_char, edu_char),
    statistic = list(all_continuous()  ~ "{mean} ({sd})",
                     all_categorical() ~ "{n}    ({p}%)"),
    digits = list(BMXBMI ~ c(2, 2),
                  RIDAGEYR ~ c(2, 2),
                  all_categorical() ~ c(0, 1))
  ) %>%
  modify_header(label = "**Variable**") %>%
  modify_caption("Weighted descriptive statistics of farmworkers by citizenship") %>%
  bold_labels()
Weighted descriptive statistics of farmworkers by citizenship
Variable Non-Citizen, N = 598,9091 US-Citizen, N = 3,213,7251
Body Mass Index 27.98 (4.58) 28.76 (6.52)
Age in Years 41.67 (15.06) 50.59 (18.23)
Survey Year
    1999-2000 41,918 (7.0%) 302,618 (9.4%)
    2001-2002 53,139 (8.9%) 396,684 (12.3%)
    2003-2004 73,413 (12.3%) 935,460 (29.1%)
    2005-2006 84,468 (14.1%) 197,697 (6.2%)
    2007-2008 49,426 (8.3%) 247,020 (7.7%)
    2009-2010 127,396 (21.3%) 476,599 (14.8%)
    2011-2012 88,019 (14.7%) 419,001 (13.0%)
    2013-2014 81,130 (13.5%) 238,647 (7.4%)
Gender
    Male 437,950 (73.1%) 1,922,438 (59.8%)
    Female 160,959 (26.9%) 1,291,287 (40.2%)
Racial Ethnicity
    Mexican American 443,852 (74.1%) 188,428 (5.9%)
    Other Hispanic 28,601 (4.8%) 60,899 (1.9%)
    Non-Hispanic White 58,702 (9.8%) 2,633,562 (81.9%)
    Non-Hispanic Black 15,997 (2.7%) 218,046 (6.8%)
    Other Race 51,757 (8.6%) 112,790 (3.5%)
Education Level
    Less than 9th grade 346,565 (57.9%) 316,623 (9.9%)
    9-11th grade 88,363 (14.8%) 397,651 (12.4%)
    High school graduate/GED 109,668 (18.3%) 749,684 (23.3%)
    Some college or AA degree 17,137 (2.9%) 990,299 (30.8%)
    College graduate or above 37,175 (6.2%) 759,468 (23.6%)
1 Mean (SD); n (%)
svyranktest(BMXBMI ~ DMDCITZN, svy.nomiss.adj, test = c("wilcoxon"))
## 
##  Design-based KruskalWallis test
## 
## data:  BMXBMI ~ DMDCITZN
## t = -0.80681, df = 82, p-value = 0.4221
## alternative hypothesis: true difference in mean rank score is not equal to 0
## sample estimates:
## difference in mean rank score 
##                   -0.01894701
svyranktest(RIDAGEYR ~ DMDCITZN, svy.nomiss.adj, test = c("wilcoxon"))
## 
##  Design-based KruskalWallis test
## 
## data:  RIDAGEYR ~ DMDCITZN
## t = -5.8706, df = 82, p-value = 8.894e-08
## alternative hypothesis: true difference in mean rank score is not equal to 0
## sample estimates:
## difference in mean rank score 
##                    -0.1444347
svychisq( ~ DMDCITZN + SDDSRVYR, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~DMDCITZN + SDDSRVYR, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 31.599, df = 7, p-value = 0.04774
svychisq( ~ DMDCITZN + RIAGENDR, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~DMDCITZN + RIAGENDR, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 8.3891, df = 1, p-value = 0.01068
svychisq( ~ DMDCITZN + RIDRETH1, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~DMDCITZN + RIDRETH1, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 410.09, df = 4, p-value < 2.2e-16
svychisq( ~ DMDCITZN + edu, svy.nomiss.adj, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~DMDCITZN + edu, svy.nomiss.adj, statistic = c("Chisq"))
## X-squared = 199.14, df = 4, p-value < 2.2e-16

Bioactivity of pesticides

toxcast_assays %>%
  select(chnm, casn, aenm) %>%
  filter(!is.na(chnm)) %>%
  distinct() %>%
  group_by(chnm, casn) %>%
  summarise(n=n()) %>%
  ungroup() %>%
  left_join(toxcast %>%
              select(chnm, casn, hitc, aenm) %>%
              filter(!is.na(chnm), hitc==1) %>%
              distinct() %>%
              group_by(chnm, casn) %>%
              summarise(pos=sum(hitc)) %>%
              ungroup()) %>%
  mutate(percent = (pos/n)*100) %>%
  left_join(toxcast %>%
              select(chnm, casn, chem_min_log, modl_acc, hitc) %>%
              filter(!is.na(chnm), hitc==1) %>%
              distinct() %>%
              group_by(chnm, casn) %>%
              summarise(threshold=exp(min(modl_acc))) %>%
              ungroup()) -> bio_threshold

kableExtra::kable(bio_threshold,
                  caption="Bioactivity of pesticides, by pesticide and persistence",
                  booktabs = T, align = c("l","l","r","r","r","r"),
                  col.names = c("Common Name", "CAS-RN", "Total Assays",
                                "Positive Assays", "Bio-active Assay  Percentage",
                                "Bioactivity Threshold (µM)")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                            full_width = F)
Bioactivity of pesticides, by pesticide and persistence
Common Name CAS-RN Total Assays Positive Assays Bio-active Assay Percentage Bioactivity Threshold (µM)
2,4-Dichlorophenol 120-83-2 701 28 3.994294 0.3352516
2,4-Dichlorophenoxyacetic acid 94-75-7 836 23 2.751196 0.0064870
2,5-Dichlorophenol 583-78-8 626 13 2.076677 0.3297033
3,5,6-Trichloro-2-pyridinol 6515-38-4 573 30 5.235602 1.3512499
3-Phenoxybenzoic acid 3739-38-6 762 13 1.706037 0.3986559
4-Nitrophenol 100-02-7 705 48 6.808511 0.0086321
Chlorpyrifos 2921-88-2 662 124 18.731118 1.4547414
Chlorpyrifos oxon 5598-15-2 839 162 19.308701 0.0407919
DEET 134-62-3 797 17 2.132999 1.0457069
Dichlorodiphenyltrichloroethane 50-29-3 691 250 36.179450 0.4324775
Dieldrin 60-57-1 700 150 21.428571 0.3221553
Heptachlor 76-44-8 671 238 35.469449 1.3130273
beta-Hexachlorocyclohexane 319-85-7 675 28 4.148148 0.1545289
p,p’-DDE 72-55-9 703 230 32.716927 0.3124763

Data analysis

Non-parametric Wilcoxon-Mann-Whitney U

The output for this model is log molarity of the chemical (\(\frac{\mathrm{\mu mol}}{L}\)) for LBX data and log ratio of the molarity of the chemical with the log molarity of creatine (\(\frac{\mathrm{\mu mol}}{L}\)) for URX data.

options(survey.lonely.psu = "adjust")
options(survey.adjust.domain.lonely = TRUE)

chem_names <- as.character(chem_weight_names[,1])
weight_names <- as.character(chem_weight_names[,2])

desired_length <- length(chem_names)
datalist <- vector(mode = "list", length = desired_length)

# ROC equivalent
auc_wmw2 <- function(labels, scores){
  labels <- as.logical(labels)
  n1 <- sum(labels)
  n2 <- sum(!labels)
  R1 <- sum(rank(scores)[labels])
  U1 <- R1 - n1 * (n1 + 1)/2
  U1/(n1 * n2)
}

for(i in seq(1:desired_length)) {
  demo_dat %>%
    dplyr::select(!starts_with("WT_")) %>%
    left_join(chem_vars_for_analysis %>%
                filter(chemical==chem_names[i]) %>%
                select(-chemical)) -> temp_dat1
  demo_dat %>%
    dplyr::select(weight_names[i]) -> temp_dat2
  colnames(temp_dat2) <- c("WT")
  cbind(temp_dat1, temp_dat2) %>% filter(!is.na(WT)) %>%
    mutate(nomiss.fw.lb = !is.na(farmwork) & !is.na(molarity) & RIDAGEYR >= min_age & WT > 0,
           nomiss.cz.lb = !is.na(DMDCITZN) & !is.na(molarity) & RIDAGEYR >= min_age & WT > 0,
           nomiss.fw.ur = !is.na(farmwork) & !is.na(molarity_log_ratio) & RIDAGEYR >= min_age & WT > 0,
           nomiss.cz.ur = !is.na(DMDCITZN) & !is.na(molarity_log_ratio) &
             RIDAGEYR >= min_age & WT > 0) -> temp_dat
  
  svydesign(strata  = temp_dat$SDMVSTRA, id   = temp_dat$SDMVPSU,
            weights = temp_dat$WT,       data = temp_dat, nest=T) -> nhanes.svy
  
  if(grepl('^LB', chem_names[i])) {
    subset(nhanes.svy, nomiss.fw.lb) -> svy.nomiss.fw
    subset(nhanes.svy, nomiss.cz.lb) -> svy.nomiss.cz
    
    temp_dat %>% filter(nomiss.fw.lb) -> no.na.data.fw
    no.na.data.fw %>%
      mutate(molarity = log(molarity)) %>% pull(molarity) -> prediction_fw
  
    temp_dat %>% filter(nomiss.cz.lb) -> no.na.data.cz
    no.na.data.cz %>%
      mutate(molarity = log(molarity)) %>% pull(molarity) -> prediction_cz
    
    svyranktest(log(molarity) ~ farmwork, design = svy.nomiss.fw,
                na = TRUE,      test = c("wilcoxon")) -> fw
    
    svyranktest(log(molarity) ~ DMDCITZN, design = svy.nomiss.cz,
                na = TRUE,      test = c("wilcoxon")) -> cz
  } else {
    subset(nhanes.svy, nomiss.fw.ur) -> svy.nomiss.fw
    subset(nhanes.svy, nomiss.cz.ur) -> svy.nomiss.cz
    
    temp_dat %>% filter(nomiss.fw.ur) -> no.na.data.fw
    no.na.data.fw %>%
      pull(molarity_log_ratio) -> prediction_fw
  
    temp_dat %>% filter(nomiss.cz.ur) -> no.na.data.cz
    no.na.data.cz %>%
      pull(molarity_log_ratio) -> prediction_cz
    
    svyranktest(molarity_log_ratio ~ farmwork, design = svy.nomiss.fw,
                na = TRUE,      test = c("wilcoxon")) -> fw
    
    svyranktest(molarity_log_ratio ~ DMDCITZN, design = svy.nomiss.cz,
                na = TRUE,      test = c("wilcoxon")) -> cz
  }
  
  # Totals
  no.na.data.fw %>% dplyr::select(SDMVSTRA, SEQN) %>%
    group_by(SDMVSTRA) %>% summarise(n=as.numeric(n())) %>% ungroup()->Ntotal
  as.data.frame(as.matrix(Ntotal))->Ntotal
  round(svytable(~farmwork, svy.nomiss.fw, Ntotal=Ntotal)[2]) -> estNgroup
  no.na.data.fw %>% nrow() -> n
  
  no.na.data.cz %>% dplyr::select(SDMVSTRA, SEQN) %>%
    group_by(SDMVSTRA) %>% summarise(n=as.numeric(n()))->Ntotal2
  as.data.frame(as.matrix(Ntotal2))->Ntotal2
  round(svytable(~DMDCITZN, svy.nomiss.cz, Ntotal=Ntotal2)[2]) -> estNgroup2
  no.na.data.cz %>% nrow() -> n2

  # ROC
  no.na.data.fw %>%
    mutate(farmwork = as.numeric(farmwork), farmwork = ifelse(farmwork==1, 0, 1)) %>%
    pull(farmwork) -> category_fw
  auc_wmw2(category_fw, prediction_fw) -> auc_fw
  
  no.na.data.cz %>%
    mutate(DMDCITZN = as.numeric(DMDCITZN), DMDCITZN = ifelse(DMDCITZN==1, 0, 1)) %>%
    pull(DMDCITZN) -> category_cz
  auc_wmw2(category_cz, prediction_cz) -> auc_cz
  
  data.frame(chem     = c(chem_names[i],   chem_names[i]),
             variable = c("FarmWorker",    "NonCitizen"),
             weight   = c(weight_names[i], weight_names[i]),
             estNgroup= c(estNgroup,       estNgroup2),
             N        = c(n       ,        n2),
             t        = c(fw$statistic,    cz$statistic),
             df       = c(fw$parameter,    cz$parameter),
             ROC      = c(auc_fw,          auc_cz),
             p        = c(fw$p.value,      cz$p.value)
             ) -> data_rows
  
  datalist[[i]] <- data_rows
}
do.call(rbind, datalist) -> wilcoxon_results

# Format for table
wilcoxon_results %>%
  mutate(p.fdr = p.adjust(p, method="fdr"),
         p.fdr.sig = ifelse(p.fdr<0.05,"*","")) %>%
  mutate(p = if_else(p < 0.01,
                     formatC(p, digits = 4, format = "e"),
                     formatC(p, digits = 4, format = "f", drop0trailing = FALSE)),
         p.fdr = if_else(p.fdr < 0.01,
                     formatC(p.fdr, digits = 4, format = "e"),
                     formatC(p.fdr, digits = 4, format = "f", drop0trailing = FALSE)),
  ) %>%
  mutate_if(is.numeric, function(x) {
    formatC(x, digits = 4, format = "f", drop0trailing = FALSE)
  }) %>%
  mutate(estNgroup =as.integer(estNgroup),
         N =as.integer(N)) -> wilcoxon_table

Non-parametric Wilcoxon-Mann-Whitney U table

wilcoxon_table[-1] -> wilcoxon_table_sm

kableExtra::kable(wilcoxon_table_sm, caption="Wilcoxon-Mann-Whitney U", booktabs = T, align = c("l","l","r","r","r","r","r","l","l","r")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  kableExtra::pack_rows("LBXBHC", 1, 2) %>%
  kableExtra::pack_rows("LBXDIE", 3, 4) %>%
  kableExtra::pack_rows("LBXHPE", 5, 6) %>%
  kableExtra::pack_rows("LBXPDE", 7, 8) %>%
  kableExtra::pack_rows("LBXPDT", 9, 10) %>%
  kableExtra::pack_rows("URX24D", 11, 12) %>%
  kableExtra::pack_rows("URXDEA", 13, 14) %>%
  kableExtra::pack_rows("URXOPM", 15, 16) %>%
  kableExtra::pack_rows("URXPAR", 17, 18) %>%
  kableExtra::pack_rows("URXCPM", 19, 20) %>%
  kableExtra::pack_rows("URX14D", 21, 22) %>%
  kableExtra::pack_rows("URXDCB", 23, 24)
Wilcoxon-Mann-Whitney U
variable weight estNgroup N t df ROC p p.fdr p.fdr.sig
LBXBHC
FarmWorker WT_LBXBHC 245 4375 1.9537 43.0000 0.5554 0.0573 0.0982
NonCitizen WT_LBXBHC 470 4605 11.9385 43.0000 0.6802 3.0611e-15 7.3467e-14
LBXDIE
FarmWorker WT_LBXDIE 182 2986 0.8631 29.0000 0.5288 0.3952 0.5269
NonCitizen WT_LBXDIE 326 3119 -3.3731 29.0000 0.3807 2.1244e-03 6.3731e-03
LBXHPE
FarmWorker WT_LBXHPE 237 4076 1.3726 43.0000 0.5375 0.1770 0.2499
NonCitizen WT_LBXHPE 441 4298 -3.7089 43.0000 0.4614 5.9220e-04 2.0304e-03
LBXPDE
FarmWorker WT_LBXPDE 250 4421 1.6139 43.0000 0.5519 0.1139 0.1708
NonCitizen WT_LBXPDE 473 4654 8.7163 43.0000 0.6941 4.7005e-11 5.6406e-10
LBXPDT
FarmWorker WT_LBXPDT 238 4161 -0.6466 43.0000 0.5222 0.5213 0.6256
NonCitizen WT_LBXPDT 447 4391 6.2875 43.0000 0.6934 1.3952e-07 8.3714e-07
URX24D
FarmWorker WT_URX24D 338 10813 1.9699 107.0000 0.5286 0.0514 0.0982
NonCitizen WT_URX24D 1254 11394 -2.9099 107.0000 0.4756 4.3989e-03 0.0117
URXDEA
FarmWorker WT_URXDEA 152 7156 1.7648 63.0000 0.5616 0.0824 0.1319
NonCitizen WT_URXDEA 820 7524 -2.2939 63.0000 0.4502 0.0251 0.0549
URXOPM
FarmWorker WT_URXOPM 256 9239 -0.1428 92.0000 0.4728 0.8867 0.9673
NonCitizen WT_URXOPM 1077 9721 -0.8146 92.0000 0.4850 0.4174 0.5273
URXPAR
FarmWorker WT_URXPAR 250 9192 -4.3490 92.0000 0.4139 3.5295e-05 1.4118e-04
NonCitizen WT_URXPAR 1078 9677 2.6750 92.0000 0.5265 8.8461e-03 0.0212
URXCPM
FarmWorker WT_URXCPM 212 7719 -0.0665 75.0000 0.5009 0.9471 0.9883
NonCitizen WT_URXCPM 903 8105 -1.9613 75.0000 0.4894 0.0536 0.0982
URX14D
FarmWorker WT_URX14D 279 10415 0.0051 93.0000 0.5061 0.9960 0.9960
NonCitizen WT_URX14D 1115 10958 5.7059 93.0000 0.5780 1.3668e-07 8.3714e-07
URXDCB
FarmWorker WT_URXDCB 279 10415 -0.5476 93.0000 0.5128 0.5853 0.6689
NonCitizen WT_URXDCB 1115 10958 5.0403 93.0000 0.5705 2.2812e-06 1.0950e-05

Logistic regression

Log molariry of creatine is added as an input variable for URX chemicals and left out for LBX chemicals.

options(survey.lonely.psu = "adjust")
options(survey.adjust.domain.lonely = TRUE)

suppressMessages(library(pROC))

datalist <- vector(mode = "list", length = desired_length)

# Logreg
for(i in seq(1:desired_length)) {
  demo_dat %>%
    dplyr::select(!starts_with("WT_")) %>%
    left_join(chem_vars_for_analysis %>%
                filter(chemical==chem_names[i]) %>%
                select(-chemical)) -> temp_dat1
  demo_dat %>%
    dplyr::select(weight_names[i]) -> temp_dat2
  colnames(temp_dat2) <- c("WT")
  cbind(temp_dat1, temp_dat2) %>% filter(!is.na(WT)) %>%
    mutate(nomiss.raw = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(indiv_chem_bioactiv) & RIDAGEYR >= min_age & WT > 0,
           nomiss.lb.adj = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(indiv_chem_bioactiv) & !is.na(BMXBMI) & !is.na(RIDAGEYR) &
             !is.na(SDDSRVYR) & !is.na(RIAGENDR) & !is.na(edu) &
             !is.na(RIDRETH1) & RIDAGEYR >= min_age & WT > 0,
           nomiss.ur.adj = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(indiv_chem_bioactiv) & !is.na(BMXBMI) & !is.na(RIDAGEYR) &
             !is.na(SDDSRVYR) & !is.na(RIAGENDR) & !is.na(edu) &
             !is.na(RIDRETH1) & !is.na(URXUCR_umol) & RIDAGEYR >= min_age &
             WT > 0) -> temp_dat
  
  svydesign(strata  = temp_dat$SDMVSTRA, id   = temp_dat$SDMVPSU,
            weights = temp_dat$WT,       data = temp_dat, nest=T) -> nhanes.svy
  
  subset(nhanes.svy, nomiss.raw) -> svy.nomiss.raw
  
  temp_dat %>% filter(nomiss.raw) -> no.na.data
  
  sum(no.na.data$indiv_chem_bioactiv)/length(no.na.data$indiv_chem_bioactiv) -> outcomes1
  
  if(grepl('^LB', chem_names[i])){
    subset(nhanes.svy, nomiss.lb.adj) -> svy.nomiss.adj
    
    temp_dat %>% filter(nomiss.lb.adj) -> no.na.data2
  } else {
    subset(nhanes.svy, nomiss.ur.adj) -> svy.nomiss.adj
    
    temp_dat %>% filter(nomiss.ur.adj) -> no.na.data2
  }
  
  #no.na.data2 %>% select(indiv_chem_bioactiv) %>% distinct() %>% nrow() -> outcomes2
  sum(no.na.data2$indiv_chem_bioactiv)/length(no.na.data2$indiv_chem_bioactiv) -> outcomes2
  
  no.na.data %>% dplyr::select(SDMVSTRA, SEQN) %>%
    group_by(SDMVSTRA) %>% summarise(n=as.numeric(n()))->Ntotal
  as.data.frame(as.matrix(Ntotal))->Ntotal
  round(svytable(~farmwork, svy.nomiss.raw, Ntotal=Ntotal)[2]) -> estNgroup
  no.na.data %>% nrow() -> n
  
  no.na.data2 %>% dplyr::select(SDMVSTRA, SEQN) %>%
    group_by(SDMVSTRA) %>% summarise(n=as.numeric(n()))->Ntotal2
  as.data.frame(as.matrix(Ntotal2))->Ntotal2
  round(svytable(~farmwork, svy.nomiss.adj, Ntotal=Ntotal2)[2]) -> estNgroup2
  no.na.data2 %>% nrow() -> n2
  
  if (min(outcomes1,outcomes2)<0.015) {
      data.frame(chem     = c(chem_names[i]),
                 model    = c("Unadjusted",    "Adjusted"),
                 variable = c("FarmWorker",    "FarmWorker",
                              "NonCitizen",    "NonCitizen"),
                 weight   = c(weight_names[i]),
                 estNgroup= c(estNgroup,       estNgroup2),
                 N        = c(n       ,        n2),
                 OR       = c(NA_real_,        NA_real_),
                 CI_2.5   = c(NA_real_,        NA_real_),
                 CI_97.5  = c(NA_real_,        NA_real_),
                 t        = c(NA_real_,        NA_real_),
                 #maxVIF   = c(NA_real_,        NA_real_),
                 ROC      = c(NA_real_,        NA_real_),
                 p        = c(NA_real_,        NA_real_)
                 ) -> data_rows
    } else {
      
      #model included
      svyglm(indiv_chem_bioactiv ~ farmwork + DMDCITZN, design=svy.nomiss.raw,
             family = quasibinomial(link = "logit")) -> fw_raw
      
      #model included
      if(grepl('^LB', chem_names[i])){
        svyglm(indiv_chem_bioactiv ~ farmwork + DMDCITZN + BMXBMI +
                 RIDAGEYR + SDDSRVYR + RIAGENDR + edu + RIDRETH1,
               design = svy.nomiss.adj, family = quasibinomial(link = "logit")) -> fw_adj
      } else {
        svyglm(indiv_chem_bioactiv ~ farmwork + DMDCITZN + BMXBMI +
                 RIDAGEYR + SDDSRVYR + RIAGENDR + edu + RIDRETH1 + log(URXUCR_umol),
               design = svy.nomiss.adj, family = quasibinomial(link = "logit")) -> fw_adj
      }
      
      # Max VIF
      #max(summ(fw_adj, vifs=TRUE)$coeftable[,5], na.rm=TRUE) -> vif_fw_adj
      
      # Discrimination (ROC)
      tp.fp_raw <- WeightedROC(fitted(fw_raw),svy.nomiss.raw$variables$indiv_chem_bioactiv, 
                               no.na.data$WT)
      WeightedAUC(tp.fp_raw) -> roc_raw
      tp.fp_adj <- WeightedROC(fitted(fw_adj),svy.nomiss.adj$variables$indiv_chem_bioactiv, 
                               no.na.data2$WT)
      WeightedAUC(tp.fp_adj) -> roc_adj
      
        data.frame(chem     = c(chem_names[i]),
                   model    = c("Unadjusted",    "Adjusted"),
                   variable = c("FarmWorker",    "FarmWorker",
                                "NonCitizen",    "NonCitizen"),
                   weight   = c(weight_names[i]),
                   estNgroup= c(estNgroup,       estNgroup2),
                   N        = c(n       ,        n2),
                   OR       = c(exp(coefficients(fw_raw)[2]),
                                exp(coefficients(fw_adj)[2]),
                                exp(coefficients(fw_raw)[3]),
                                exp(coefficients(fw_adj)[3])),
                   CI_2.5   = c(exp(confint(fw_raw, ddf=degf(fw_raw$survey.design)))[2,1],
                                exp(confint(fw_adj, ddf=degf(fw_adj$survey.design)))[2,1],
                                exp(confint(fw_raw, ddf=degf(fw_raw$survey.design)))[3,1],
                                exp(confint(fw_adj, ddf=degf(fw_adj$survey.design)))[3,1]),
                   CI_97.5  = c(exp(confint(fw_raw, ddf=degf(fw_raw$survey.design)))[2,2],
                                exp(confint(fw_adj, ddf=degf(fw_adj$survey.design)))[2,2],
                                exp(confint(fw_raw, ddf=degf(fw_raw$survey.design)))[3,2],
                                exp(confint(fw_adj, ddf=degf(fw_adj$survey.design)))[3,2]),
                   t        = c(summary(fw_raw, df.resid = degf(fw_raw$survey.design))$coef[2,3],
                                summary(fw_adj, df.resid = degf(fw_adj$survey.design))$coef[2,3],
                                summary(fw_raw, df.resid = degf(fw_raw$survey.design))$coef[3,3],
                                summary(fw_adj, df.resid = degf(fw_adj$survey.design))$coef[3,3]),
                   #maxVIF   = c(NA_real_, vif_fw_adj),
                   ROC      = c(roc_raw,
                                roc_adj),
                   p        = c(summary(fw_raw)$coefficients[2,4],
                                summary(fw_adj)$coefficients[2,4],
                                summary(fw_raw)$coefficients[3,4],
                                summary(fw_adj)$coefficients[3,4])
                   ) -> data_rows
      }

  datalist[[i]] <- data_rows
  #print(i)
}
do.call(rbind, datalist) -> logreg_results_fw

# Organize into unadjusted and adjusted tables
logreg_results_fw %>%
  filter(model == "Unadjusted") -> logreg_results_raw
logreg_results_fw %>%
  filter(model == "Adjusted") -> logreg_results_adj


# Format for tables
logreg_results_raw %>%
  mutate(p.fdr = p.adjust(p, method="fdr"),
         p.fdr.sig = ifelse(p.fdr<0.05,"*","")) %>%
  mutate(OR = if_else(OR < 0.0001,
                     formatC(OR, digits = 4, format = "e"),
                     formatC(OR, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_2.5 = if_else(CI_2.5 < 0.0001,
                     formatC(CI_2.5, digits = 4, format = "e"),
                     formatC(CI_2.5, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_97.5 = if_else(CI_97.5 < 0.0001,
                     formatC(CI_97.5, digits = 4, format = "e"),
                     formatC(CI_97.5, digits = 4, format = "f", drop0trailing = FALSE)),
         p = if_else(p < 0.01,
                     formatC(p, digits = 4, format = "e"),
                     formatC(p, digits = 4, format = "f", drop0trailing = FALSE)),
         p.fdr = if_else(p.fdr < 0.01,
                         formatC(p.fdr, digits = 4, format = "e"),
                         formatC(p.fdr, digits = 4, format = "f", drop0trailing = FALSE)),
  ) %>%
  mutate_if(is.numeric, function(x) {
    formatC(x, digits = 4, format = "f", drop0trailing = FALSE)
  }) %>%
  mutate(estNgroup =as.integer(estNgroup),
         N =as.integer(N)) -> logreg_table_raw

logreg_results_adj %>%
  mutate(p.fdr = p.adjust(p, method="fdr"),
         p.fdr.sig = ifelse(p.fdr<0.05,"*","")) %>%
  mutate(OR = if_else(OR < 0.0001,
                     formatC(OR, digits = 4, format = "e"),
                     formatC(OR, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_2.5 = if_else(CI_2.5 < 0.0001,
                     formatC(CI_2.5, digits = 4, format = "e"),
                     formatC(CI_2.5, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_97.5 = if_else(CI_97.5 < 0.0001,
                     formatC(CI_97.5, digits = 4, format = "e"),
                     formatC(CI_97.5, digits = 4, format = "f", drop0trailing = FALSE)),
         p = if_else(p < 0.01,
                     formatC(p, digits = 4, format = "e"),
                     formatC(p, digits = 4, format = "f", drop0trailing = FALSE)),
         p.fdr = if_else(p.fdr < 0.01,
                         formatC(p.fdr, digits = 4, format = "e"),
                         formatC(p.fdr, digits = 4, format = "f", drop0trailing = FALSE)),
  ) %>%
  mutate_if(is.numeric, function(x) {
    formatC(x, digits = 4, format = "f", drop0trailing = FALSE)
  }) %>%
  mutate(estNgroup =as.integer(estNgroup),
         N =as.integer(N)) -> logreg_table_adj

Logistic regression table (unadjusted)

logreg_table_raw[-c(1:2)] -> logreg_table_raw_sm

# Unadjusted table
kableExtra::kable(logreg_table_raw_sm,caption="Unadjusted Log Reg", booktabs = T, align = c("l","l","r","r","r","r","r","r","r","l","l","r")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  kableExtra::pack_rows("LBXBHC", 1, 2) %>%
  kableExtra::pack_rows("LBXDIE", 3, 4) %>%
  kableExtra::pack_rows("LBXHPE", 5, 6) %>%
  kableExtra::pack_rows("LBXPDE", 7, 8) %>%
  kableExtra::pack_rows("LBXPDT", 9, 10) %>%
  kableExtra::pack_rows("URX24D", 11, 12) %>%
  kableExtra::pack_rows("URXDEA", 13, 14) %>%
  kableExtra::pack_rows("URXOPM", 15, 16) %>%
  kableExtra::pack_rows("URXPAR", 17, 18) %>%
  kableExtra::pack_rows("URXCPM", 19, 20) %>%
  #kableExtra::pack_rows("URXCPF", 21, 22) %>%
  #kableExtra::pack_rows("URXCPO", 23, 24) %>%
  kableExtra::pack_rows("URX14D", 21, 22) %>%
  kableExtra::pack_rows("URXDCB", 23, 24)
Unadjusted Log Reg
variable weight estNgroup N OR CI_2.5 CI_97.5 t ROC p p.fdr p.fdr.sig
LBXBHC
FarmWorker WT_LBXBHC 245 4366 1.4472 0.8724 2.4006 1.4720 0.5829 0.1485 0.2376
NonCitizen WT_LBXBHC 245 4366 3.7175 2.6420 5.2307 7.7488 0.5829 1.2589e-09 2.0142e-08
LBXDIE
FarmWorker WT_LBXDIE 182 2984 NA NA NA NA NA NA NA NA
NonCitizen WT_LBXDIE 182 2984 NA NA NA NA NA NA NA NA
LBXHPE
FarmWorker WT_LBXHPE 237 4068 NA NA NA NA NA NA NA NA
NonCitizen WT_LBXHPE 237 4068 NA NA NA NA NA NA NA NA
LBXPDE
FarmWorker WT_LBXPDE 250 4411 1.2505 0.7976 1.9603 1.0019 0.5346 0.3221 0.4295
NonCitizen WT_LBXPDE 250 4411 3.4383 1.9706 5.9991 4.4713 0.5346 5.8052e-05 1.8577e-04
LBXPDT
FarmWorker WT_LBXPDT 239 4153 1.1619 0.3809 3.5442 0.2711 0.8022 0.7876 0.8401
NonCitizen WT_LBXPDT 239 4153 26.8738 10.6009 68.1264 7.1305 0.8022 9.4640e-09 7.5712e-08
URX24D
FarmWorker WT_URX24D 339 10800 3.8588 2.4026 6.1974 5.6495 0.5372 1.3648e-07 5.4590e-07
NonCitizen WT_URX24D 339 10800 0.9425 0.6557 1.3548 -0.3234 0.5372 0.7470 0.8401
URXDEA
FarmWorker WT_URXDEA 152 7149 3.6107 1.2510 10.4210 2.4199 0.5414 0.0185 0.0352
NonCitizen WT_URXDEA 152 7149 0.6351 0.2875 1.4028 -1.1444 0.5414 0.2568 0.3736
URXOPM
FarmWorker WT_URXOPM 256 9225 NA NA NA NA NA NA NA NA
NonCitizen WT_URXOPM 256 9225 NA NA NA NA NA NA NA NA
URXPAR
FarmWorker WT_URXPAR 250 9178 0.5923 0.4119 0.8518 -2.8628 0.5154 5.2115e-03 0.0119
NonCitizen WT_URXPAR 250 9178 1.2657 1.0391 1.5418 2.3715 0.5154 0.0198 0.0352
URXCPM
FarmWorker WT_URXCPM 212 7707 NA NA NA NA NA NA NA NA
NonCitizen WT_URXCPM 212 7707 NA NA NA NA NA NA NA NA
URX14D
FarmWorker WT_URX14D 278 10396 1.0107 0.7454 1.3703 0.0691 0.5339 0.9450 0.9450
NonCitizen WT_URX14D 278 10396 2.0563 1.6284 2.5967 6.1345 0.5339 2.1194e-08 1.1304e-07
URXDCB
FarmWorker WT_URXDCB 278 10396 0.7622 0.4142 1.4025 -0.8842 0.5537 0.3789 0.4663
NonCitizen WT_URXDCB 278 10396 2.6119 1.5329 4.4504 3.5772 0.5537 5.5630e-04 1.4835e-03

Logistic regression table (adjusted)

logreg_table_adj[-c(1:2)] -> logreg_table_adj_sm

# Adjusted table
kableExtra::kable(logreg_table_adj_sm,caption="Adjusted Log Reg", booktabs = T, align = c("l","l","r","r","r","r","r","r","r","l","l","r")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  kableExtra::pack_rows("LBXBHC", 1, 2) %>%
  kableExtra::pack_rows("LBXDIE", 3, 4) %>%
  kableExtra::pack_rows("LBXHPE", 5, 6) %>%
  kableExtra::pack_rows("LBXPDE", 7, 8) %>%
  kableExtra::pack_rows("LBXPDT", 9, 10) %>%
  kableExtra::pack_rows("URX24D", 11, 12) %>%
  kableExtra::pack_rows("URXDEA", 13, 14) %>%
  kableExtra::pack_rows("URXOPM", 15, 16) %>%
  kableExtra::pack_rows("URXPAR", 17, 18) %>%
  kableExtra::pack_rows("URXCPM", 19, 20) %>%
  #kableExtra::pack_rows("URXCPF", 21, 22) %>%
  #kableExtra::pack_rows("URXCPO", 23, 24) %>%
  kableExtra::pack_rows("URX14D", 21, 22) %>%
  kableExtra::pack_rows("URXDCB", 23, 24)
Adjusted Log Reg
variable weight estNgroup N OR CI_2.5 CI_97.5 t ROC p p.fdr p.fdr.sig
LBXBHC
FarmWorker WT_LBXBHC 238 4257 1.4356 0.8504 2.4236 1.3918 0.8689 0.1746 0.2793
NonCitizen WT_LBXBHC 238 4257 8.1014 4.4360 14.7957 7.0003 0.8689 1.0693e-07 1.3293e-06
LBXDIE
FarmWorker WT_LBXDIE 175 2893 NA NA NA NA NA NA NA NA
NonCitizen WT_LBXDIE 175 2893 NA NA NA NA NA NA NA NA
LBXHPE
FarmWorker WT_LBXHPE 229 3965 NA NA NA NA NA NA NA NA
NonCitizen WT_LBXHPE 229 3965 NA NA NA NA NA NA NA NA
LBXPDE
FarmWorker WT_LBXPDE 242 4302 0.9775 0.6491 1.4720 -0.1121 0.7954 0.9115 0.9115
NonCitizen WT_LBXPDE 242 4302 2.5968 1.3079 5.1560 2.8041 0.7954 8.9103e-03 0.0238
LBXPDT
FarmWorker WT_LBXPDT 231 4047 1.1221 0.3798 3.3155 0.2144 0.9152 0.8318 0.9115
NonCitizen WT_LBXPDT 231 4047 7.7490 2.2183 27.0690 3.2991 0.9152 2.5729e-03 0.0103
URX24D
FarmWorker WT_URX24D 330 10652 3.7574 2.3687 5.9602 5.6869 0.7338 1.6616e-07 1.3293e-06
NonCitizen WT_URX24D 330 10652 1.2687 0.8042 2.0016 1.0347 0.7338 0.3037 0.4049
URXDEA
FarmWorker WT_URXDEA 151 7073 3.5516 1.3165 9.5813 2.5513 0.7138 0.0140 0.0321
NonCitizen WT_URXDEA 151 7073 0.9361 0.4280 2.0471 -0.1687 0.7138 0.8668 0.9115
URXOPM
FarmWorker WT_URXOPM 250 9108 NA NA NA NA NA NA NA NA
NonCitizen WT_URXOPM 250 9108 NA NA NA NA NA NA NA NA
URXPAR
FarmWorker WT_URXPAR 244 9064 0.5364 0.3622 0.7944 -3.1494 0.7681 2.3604e-03 0.0103
NonCitizen WT_URXPAR 244 9064 1.2780 1.0055 1.6243 2.0313 0.7681 0.0458 0.0916
URXCPM
FarmWorker WT_URXCPM 206 7610 NA NA NA NA NA NA NA NA
NonCitizen WT_URXCPM 206 7610 NA NA NA NA NA NA NA NA
URX14D
FarmWorker WT_URX14D 274 10256 0.9522 0.7134 1.2709 -0.3368 0.7397 0.7372 0.9074
NonCitizen WT_URX14D 274 10256 1.4771 1.1226 1.9434 2.8226 0.7397 6.0954e-03 0.0195
URXDCB
FarmWorker WT_URXDCB 274 10256 0.6979 0.3685 1.3218 -1.1183 0.7938 0.2670 0.3884
NonCitizen WT_URXDCB 274 10256 1.7494 0.8119 3.7698 1.4465 0.7938 0.1522 0.2706

Linear regression

options(survey.lonely.psu = "adjust")
options(survey.adjust.domain.lonely = TRUE)

suppressMessages(library(pROC))

datalist <- vector(mode = "list", length = desired_length)

# Linreg
for(i in seq(1:desired_length)) {
  demo_dat %>%
    dplyr::select(!starts_with("WT_")) %>%
    left_join(chem_vars_for_analysis %>%
                filter(chemical==chem_names[i]) %>%
                select(-chemical)) -> temp_dat1
  demo_dat %>%
    dplyr::select(weight_names[i]) -> temp_dat2
  colnames(temp_dat2) <- c("WT")
  cbind(temp_dat1, temp_dat2) %>% filter(!is.na(WT)) %>%
    mutate(nomiss.raw = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(molarity) & RIDAGEYR >= min_age & WT > 0,
           nomiss.lb.adj = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(molarity) & !is.na(BMXBMI) & !is.na(RIDAGEYR) &
             !is.na(SDDSRVYR) & !is.na(RIAGENDR) & !is.na(edu) &
             !is.na(RIDRETH1) & RIDAGEYR >= min_age & WT > 0,
           nomiss.ur.adj = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(molarity) & !is.na(BMXBMI) & !is.na(RIDAGEYR) &
             !is.na(SDDSRVYR) & !is.na(RIAGENDR) & !is.na(edu) &
             !is.na(RIDRETH1) & !is.na(URXUCR_umol) & RIDAGEYR >= min_age &
             WT > 0) -> temp_dat
  
  svydesign(strata  = temp_dat$SDMVSTRA, id   = temp_dat$SDMVPSU,
            weights = temp_dat$WT,       data = temp_dat, nest=T) -> nhanes.svy
  
  subset(nhanes.svy, nomiss.raw) -> svy.nomiss.raw
  
  temp_dat %>% filter(nomiss.raw) -> no.na.data
  
  if(grepl('^LB', chem_names[i])){
    subset(nhanes.svy, nomiss.lb.adj) -> svy.nomiss.adj
    
    temp_dat %>% filter(nomiss.lb.adj) -> no.na.data2
  } else {
    subset(nhanes.svy, nomiss.ur.adj) -> svy.nomiss.adj
    
    temp_dat %>% filter(nomiss.ur.adj) -> no.na.data2
  }
  
  no.na.data %>% dplyr::select(SDMVSTRA, SEQN) %>%
    group_by(SDMVSTRA) %>% summarise(n=as.numeric(n()))->Ntotal
  as.data.frame(as.matrix(Ntotal))->Ntotal
  round(svytable(~farmwork, svy.nomiss.raw, Ntotal=Ntotal)[2]) -> estNgroup
  no.na.data %>% nrow() -> n
  
  no.na.data2 %>% dplyr::select(SDMVSTRA, SEQN) %>%
    group_by(SDMVSTRA) %>% summarise(n=as.numeric(n()))->Ntotal2
  as.data.frame(as.matrix(Ntotal2))->Ntotal2
  round(svytable(~farmwork, svy.nomiss.adj, Ntotal=Ntotal2)[2]) -> estNgroup2
  no.na.data2 %>% nrow() -> n2
      
      #model included
      svyglm(molarity ~ farmwork + DMDCITZN, design=svy.nomiss.raw,
             family = quasi(link = "log", variance = "mu")) -> fw_raw
      
      #model included
      if(grepl('^LB', chem_names[i])){
        svyglm(molarity ~ farmwork + DMDCITZN + BMXBMI + RIDAGEYR + SDDSRVYR +
                 RIAGENDR + edu + RIDRETH1,
               design = svy.nomiss.adj, family = quasi(link = "log", variance = "mu")) -> fw_adj
      } else {
        svyglm(molarity ~ farmwork + DMDCITZN + BMXBMI + RIDAGEYR + SDDSRVYR +
                 RIAGENDR + edu + RIDRETH1 + log(URXUCR_umol),
               design = svy.nomiss.adj, family = quasi(link = "log", variance = "mu")) -> fw_adj
      }

      # Max VIF
      #max(summ(fw_adj, vifs=TRUE)$coeftable[,5], na.rm=TRUE) -> vif_fw_adj
      
      # R2
      r2_raw = (fw_raw$null.deviance - fw_raw$deviance) / fw_raw$null.deviance
      r2_adj = (fw_adj$null.deviance - fw_adj$deviance) / fw_adj$null.deviance
      
        data.frame(chem     = c(chem_names[i]),
                   model    = c("Unadjusted",    "Adjusted"),
                   variable = c("FarmWorker",    "FarmWorker",
                                "NonCitizen",    "NonCitizen"),
                   weight   = c(weight_names[i]),
                   estNgroup= c(estNgroup,       estNgroup2),
                   N        = c(n       ,        n2),
                   beta     = c(coefficients(fw_raw)[2],
                                coefficients(fw_adj)[2],
                                coefficients(fw_raw)[3],
                                coefficients(fw_adj)[3]),
                   CI_2.5   = c(confint(fw_raw, ddf=degf(fw_raw$survey.design))[2,1],
                                confint(fw_adj, ddf=degf(fw_adj$survey.design))[2,1],
                                confint(fw_raw, ddf=degf(fw_raw$survey.design))[3,1],
                                confint(fw_adj, ddf=degf(fw_adj$survey.design))[3,1]),
                   CI_97.5  = c(confint(fw_raw, ddf=degf(fw_raw$survey.design))[2,2],
                                confint(fw_adj, ddf=degf(fw_adj$survey.design))[2,2],
                                confint(fw_raw, ddf=degf(fw_raw$survey.design))[3,2],
                                confint(fw_adj, ddf=degf(fw_adj$survey.design))[3,2]),
                   t        = c(summary(fw_raw, df.resid = degf(fw_raw$survey.design))$coef[2,3],
                                summary(fw_adj, df.resid = degf(fw_adj$survey.design))$coef[2,3],
                                summary(fw_raw, df.resid = degf(fw_raw$survey.design))$coef[3,3],
                                summary(fw_adj, df.resid = degf(fw_adj$survey.design))$coef[3,3]),
                   #maxVIF   = c(NA_real_, vif_fw_adj),
                   r2       = c(r2_raw,
                                r2_adj),
                   p        = c(summary(fw_raw)$coefficients[2,4],
                                summary(fw_adj)$coefficients[2,4],
                                summary(fw_raw)$coefficients[3,4],
                                summary(fw_adj)$coefficients[3,4])
                   ) -> data_rows

  datalist[[i]] <- data_rows
  #print(i)
}
do.call(rbind, datalist) -> linreg_results_fw

# Organize into unadjusted and adjusted tables
linreg_results_fw %>%
  filter(model == "Unadjusted") -> linreg_results_raw
linreg_results_fw %>%
  filter(model == "Adjusted") -> linreg_results_adj


# Format for tables
linreg_results_raw %>%
  mutate(p.fdr = p.adjust(p, method="fdr"),
         p.fdr.sig = ifelse(p.fdr<0.05,"*","")) %>%
  mutate(beta = if_else(abs(beta) < 0.0001,
                     formatC(beta, digits = 4, format = "e"),
                     formatC(beta, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_2.5 = if_else(abs(CI_2.5) < 0.0001,
                     formatC(CI_2.5, digits = 4, format = "e"),
                     formatC(CI_2.5, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_97.5 = if_else(abs(CI_97.5) < 0.0001,
                     formatC(CI_97.5, digits = 4, format = "e"),
                     formatC(CI_97.5, digits = 4, format = "f", drop0trailing = FALSE)),
         p = if_else(p < 0.01,
                     formatC(p, digits = 4, format = "e"),
                     formatC(p, digits = 4, format = "f", drop0trailing = FALSE)),
         p.fdr = if_else(p.fdr < 0.01,
                         formatC(p.fdr, digits = 4, format = "e"),
                         formatC(p.fdr, digits = 4, format = "f", drop0trailing = FALSE)),
  ) %>%
  mutate_if(is.numeric, function(x) {
    formatC(x, digits = 4, format = "f", drop0trailing = FALSE)
  }) %>%
  mutate(estNgroup =as.integer(estNgroup),
         N =as.integer(N)) -> linreg_table_raw

linreg_results_adj %>%
  mutate(p.fdr = p.adjust(p, method="fdr"),
         p.fdr.sig = ifelse(p.fdr<0.05,"*","")) %>%
  mutate(beta = if_else(abs(beta) < 0.0001,
                     formatC(beta, digits = 4, format = "e"),
                     formatC(beta, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_2.5 = if_else(abs(CI_2.5) < 0.0001,
                     formatC(CI_2.5, digits = 4, format = "e"),
                     formatC(CI_2.5, digits = 4, format = "f", drop0trailing = FALSE)),
         CI_97.5 = if_else(abs(CI_97.5) < 0.0001,
                     formatC(CI_97.5, digits = 4, format = "e"),
                     formatC(CI_97.5, digits = 4, format = "f", drop0trailing = FALSE)),
         p = if_else(p < 0.01,
                     formatC(p, digits = 4, format = "e"),
                     formatC(p, digits = 4, format = "f", drop0trailing = FALSE)),
         p.fdr = if_else(p.fdr < 0.01,
                         formatC(p.fdr, digits = 4, format = "e"),
                         formatC(p.fdr, digits = 4, format = "f", drop0trailing = FALSE)),
  ) %>%
  mutate_if(is.numeric, function(x) {
    formatC(x, digits = 4, format = "f", drop0trailing = FALSE)
  }) %>%
  mutate(estNgroup =as.integer(estNgroup),
         N =as.integer(N)) -> linreg_table_adj

Linear regression table (unadjusted)

linreg_table_raw[-c(1:2)] -> linreg_table_raw_sm

# Unadjusted table
kableExtra::kable(linreg_table_raw_sm,caption="Unadjusted Lin Reg", booktabs = T, align = c("l","l","r","r","r","r","r","r","r","l","l","r")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  kableExtra::pack_rows("LBXBHC", 1, 2) %>%
  kableExtra::pack_rows("LBXDIE", 3, 4) %>%
  kableExtra::pack_rows("LBXHPE", 5, 6) %>%
  kableExtra::pack_rows("LBXPDE", 7, 8) %>%
  kableExtra::pack_rows("LBXPDT", 9, 10) %>%
  kableExtra::pack_rows("URX24D", 11, 12) %>%
  kableExtra::pack_rows("URXDEA", 13, 14) %>%
  kableExtra::pack_rows("URXOPM", 15, 16) %>%
  kableExtra::pack_rows("URXPAR", 17, 18) %>%
  kableExtra::pack_rows("URXCPM", 19, 20) %>%
  #kableExtra::pack_rows("URXCPF", 21, 22) %>%
  #kableExtra::pack_rows("URXCPO", 23, 24) %>%
  kableExtra::pack_rows("URX14D", 21, 22) %>%
  kableExtra::pack_rows("URXDCB", 23, 24)
Unadjusted Lin Reg
variable weight estNgroup N beta CI_2.5 CI_97.5 t r2 p p.fdr p.fdr.sig
LBXBHC
FarmWorker WT_LBXBHC 245 4366 -0.0455 -0.4533 0.3623 -0.2247 0.0556 0.8233 0.8432
NonCitizen WT_LBXBHC 245 4366 1.0950 0.6086 1.5813 4.5377 0.0556 4.7029e-05 2.8217e-04
LBXDIE
FarmWorker WT_LBXDIE 182 2984 0.0740 -0.2314 0.3793 0.4946 0.0073 0.6247 0.7497
NonCitizen WT_LBXDIE 182 2984 -0.3345 -0.5460 -0.1229 -3.2286 0.0073 3.1667e-03 7.6002e-03
LBXHPE
FarmWorker WT_LBXHPE 237 4068 0.0928 -0.0574 0.2429 1.2446 0.0069 0.2202 0.3108
NonCitizen WT_LBXHPE 237 4068 -0.2887 -0.4411 -0.1362 -3.8151 0.0069 4.4033e-04 1.7613e-03
LBXPDE
FarmWorker WT_LBXPDE 250 4411 0.2465 -0.0372 0.5302 1.7509 0.1143 0.0873 0.1396
NonCitizen WT_LBXPDE 250 4411 1.1656 0.9143 1.4170 9.3458 0.1143 8.1256e-12 9.7508e-11
LBXPDT
FarmWorker WT_LBXPDT 239 4153 0.0723 -0.3683 0.5129 0.3307 0.2185 0.7425 0.8432
NonCitizen WT_LBXPDT 239 4153 1.7398 1.4287 2.0509 11.2692 0.2185 2.8145e-14 6.7548e-13
URX24D
FarmWorker WT_URX24D 339 10800 0.4771 -0.2793 1.2335 1.2502 0.0054 0.2140 0.3108
NonCitizen WT_URX24D 339 10800 -0.6281 -1.2395 -0.0167 -2.0362 0.0054 0.0442 0.0817
URXDEA
FarmWorker WT_URXDEA 152 7149 -0.8173 -2.7007 1.0661 -0.8669 0.0088 0.3893 0.5191
NonCitizen WT_URXDEA 152 7149 -1.8731 -3.7604 0.0141 -1.9828 0.0088 0.0518 0.0888
URXOPM
FarmWorker WT_URXOPM 256 9225 -0.0388 -0.4269 0.3494 -0.1983 0.0034 0.8432 0.8432
NonCitizen WT_URXOPM 256 9225 -0.4169 -0.6707 -0.1630 -3.2609 0.0034 1.5634e-03 5.3603e-03
URXPAR
FarmWorker WT_URXPAR 250 9178 -0.5192 -0.6838 -0.3547 -6.2659 0.0046 1.2123e-08 9.6987e-08
NonCitizen WT_URXPAR 250 9178 0.1758 0.0361 0.3155 2.4981 0.0046 0.0143 0.0286
URXCPM
FarmWorker WT_URXCPM 212 7707 -0.0281 -0.2763 0.2200 -0.2258 0.0003 0.8220 0.8432
NonCitizen WT_URXCPM 212 7707 -0.0615 -0.2130 0.0900 -0.8088 0.0003 0.4212 0.5321
URX14D
FarmWorker WT_URX14D 278 10396 -0.4899 -0.8296 -0.1503 -2.8641 0.0168 5.1798e-03 0.0113
NonCitizen WT_URX14D 278 10396 0.8829 0.4399 1.3259 3.9569 0.0168 1.4924e-04 7.1633e-04
URXDCB
FarmWorker WT_URXDCB 278 10396 -0.4450 -0.7328 -0.1572 -3.0697 0.0082 2.8162e-03 7.5100e-03
NonCitizen WT_URXDCB 278 10396 0.5588 0.2047 0.9130 3.1333 0.0082 2.3191e-03 6.9574e-03

Linear regression table (adjusted)

linreg_table_adj[-c(1:2)] -> linreg_table_adj_sm

# Adjusted table
kableExtra::kable(linreg_table_adj_sm,caption="Adjusted Lin Reg", booktabs = T, align = c("l","l","r","r","r","r","r","r","r","l","l","r")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  kableExtra::pack_rows("LBXBHC", 1, 2) %>%
  kableExtra::pack_rows("LBXDIE", 3, 4) %>%
  kableExtra::pack_rows("LBXHPE", 5, 6) %>%
  kableExtra::pack_rows("LBXPDE", 7, 8) %>%
  kableExtra::pack_rows("LBXPDT", 9, 10) %>%
  kableExtra::pack_rows("URX24D", 11, 12) %>%
  kableExtra::pack_rows("URXDEA", 13, 14) %>%
  kableExtra::pack_rows("URXOPM", 15, 16) %>%
  kableExtra::pack_rows("URXPAR", 17, 18) %>%
  kableExtra::pack_rows("URXCPM", 19, 20) %>%
  #kableExtra::pack_rows("URXCPF", 21, 22) %>%
  #kableExtra::pack_rows("URXCPO", 23, 24) %>%
  kableExtra::pack_rows("URX14D", 21, 22) %>%
  kableExtra::pack_rows("URXDCB", 23, 24)
Adjusted Lin Reg
variable weight estNgroup N beta CI_2.5 CI_97.5 t r2 p p.fdr p.fdr.sig
LBXBHC
FarmWorker WT_LBXBHC 238 4257 -0.0672 -0.4100 0.2756 -0.3952 0.4249 0.6956 0.8405
NonCitizen WT_LBXBHC 238 4257 0.8865 0.1082 1.6648 2.2956 0.4249 0.0291 0.0998
LBXDIE
FarmWorker WT_LBXDIE 175 2893 -0.0704 -0.4375 0.2967 -0.3917 0.1987 0.7004 0.8405
NonCitizen WT_LBXDIE 175 2893 -0.0486 -0.2083 0.1110 -0.6220 0.1987 0.5427 0.7772
LBXHPE
FarmWorker WT_LBXHPE 229 3965 0.0313 -0.0963 0.1590 0.4948 0.2560 0.6245 0.8326
NonCitizen WT_LBXHPE 229 3965 -0.0948 -0.2848 0.0952 -1.0058 0.2560 0.3228 0.5960
LBXPDE
FarmWorker WT_LBXPDE 242 4302 0.2169 -0.0501 0.4839 1.6371 0.4111 0.1124 0.2698
NonCitizen WT_LBXPDE 242 4302 0.9076 0.6071 1.2082 6.0865 0.4111 1.2567e-06 1.0054e-05
LBXPDT
FarmWorker WT_LBXPDT 231 4047 0.0478 -0.3216 0.4172 0.2606 0.3555 0.7962 0.8686
NonCitizen WT_LBXPDT 231 4047 1.2967 0.9675 1.6260 7.9374 0.3555 9.3909e-09 2.2538e-07
URX24D
FarmWorker WT_URX24D 330 10652 0.4239 -0.1795 1.0273 1.3927 0.2876 0.1672 0.3649
NonCitizen WT_URX24D 330 10652 0.1235 -0.2849 0.5318 0.5993 0.2876 0.5505 0.7772
URXDEA
FarmWorker WT_URXDEA 151 7073 -0.3208 -2.4766 1.8350 -0.2973 0.4904 0.7676 0.8686
NonCitizen WT_URXDEA 151 7073 -0.1515 -1.7990 1.4961 -0.1837 0.4904 0.8551 0.8923
URXOPM
FarmWorker WT_URXOPM 250 9108 -0.1557 -0.6708 0.3594 -0.6003 0.1483 0.5501 0.7772
NonCitizen WT_URXOPM 250 9108 -0.3848 -0.7512 -0.0183 -2.0850 0.1483 0.0405 0.1081
URXPAR
FarmWorker WT_URXPAR 244 9064 -0.5113 -0.6986 -0.3241 -5.4240 0.1351 7.0264e-07 8.4316e-06
NonCitizen WT_URXPAR 244 9064 0.1807 0.0461 0.3154 2.6653 0.1351 9.4393e-03 0.0378
URXCPM
FarmWorker WT_URXCPM 206 7610 -0.0859 -0.3046 0.1329 -0.7819 0.3421 0.4375 0.7500
NonCitizen WT_URXCPM 206 7610 0.0032 -0.1742 0.1807 0.0364 0.3421 0.9711 0.9711
URX14D
FarmWorker WT_URX14D 274 10256 -0.4900 -0.7958 -0.1842 -3.1820 0.1819 2.1290e-03 0.0102
NonCitizen WT_URX14D 274 10256 0.6153 0.0441 1.1866 2.1387 0.1819 0.0357 0.1072
URXDCB
FarmWorker WT_URXDCB 274 10256 -0.5007 -0.7882 -0.2132 -3.4580 0.1780 9.0012e-04 5.4007e-03
NonCitizen WT_URXDCB 274 10256 0.2923 -0.1575 0.7421 1.2904 0.1780 0.2009 0.4018

Models for consideration in manuscript

Non-parametric Wilcoxon-Mann-Whitney U

wilcoxon_table[-1] %>% filter(ROC>0.6, p.fdr.sig=="*") -> wilcoxon_table_sm2

kableExtra::kable(wilcoxon_table_sm2, caption="Wilcoxon-Mann-Whitney U", booktabs = T, align = c("l","l","r","r","r","r","r","l","l","r")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  kableExtra::pack_rows("LBXBHC", 1, 1) %>%
  kableExtra::pack_rows("LBXPDE", 2, 2) %>%
  kableExtra::pack_rows("LBXPDT", 3, 3)
Wilcoxon-Mann-Whitney U
variable weight estNgroup N t df ROC p p.fdr p.fdr.sig
LBXBHC
NonCitizen WT_LBXBHC 470 4605 11.9385 43.0000 0.6802 3.0611e-15 7.3467e-14
LBXPDE
NonCitizen WT_LBXPDE 473 4654 8.7163 43.0000 0.6941 4.7005e-11 5.6406e-10
LBXPDT
NonCitizen WT_LBXPDT 447 4391 6.2875 43.0000 0.6934 1.3952e-07 8.3714e-07

Logistic regression

logreg_table_adj[-c(1:2)] %>% filter(ROC>0.6, p.fdr.sig=="*") -> logreg_table_adj_sm2

# Adjusted table
kableExtra::kable(logreg_table_adj_sm2,caption="Adjusted Log Reg", booktabs = T, align = c("l","l","r","r","r","r","r","r","r","l","l","r")) %>%
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = F) %>%
  kableExtra::pack_rows("LBXBHC", 1,1) %>%
  kableExtra::pack_rows("LBXPDE", 2,2) %>%
  kableExtra::pack_rows("LBXPDT", 3,3) %>%
  kableExtra::pack_rows("URX24D", 4,4) %>%
  kableExtra::pack_rows("URXDEA", 5,5) %>%
  kableExtra::pack_rows("URXPAR", 6,6) %>%
  kableExtra::pack_rows("URX14D", 7,7)
Adjusted Log Reg
variable weight estNgroup N OR CI_2.5 CI_97.5 t ROC p p.fdr p.fdr.sig
LBXBHC
NonCitizen WT_LBXBHC 238 4257 8.1014 4.4360 14.7957 7.0003 0.8689 1.0693e-07 1.3293e-06
LBXPDE
NonCitizen WT_LBXPDE 242 4302 2.5968 1.3079 5.1560 2.8041 0.7954 8.9103e-03 0.0238
LBXPDT
NonCitizen WT_LBXPDT 231 4047 7.7490 2.2183 27.0690 3.2991 0.9152 2.5729e-03 0.0103
URX24D
FarmWorker WT_URX24D 330 10652 3.7574 2.3687 5.9602 5.6869 0.7338 1.6616e-07 1.3293e-06
URXDEA
FarmWorker WT_URXDEA 151 7073 3.5516 1.3165 9.5813 2.5513 0.7138 0.0140 0.0321
URXPAR
FarmWorker WT_URXPAR 244 9064 0.5364 0.3622 0.7944 -3.1494 0.7681 2.3604e-03 0.0103
URX14D
NonCitizen WT_URX14D 274 10256 1.4771 1.1226 1.9434 2.8226 0.7397 6.0954e-03 0.0195

Exploration of farmworker subset

Estimated probability of bioactivity (significant Log Reg adjusted results)

logreg_table_adj_sm2 %>%
  mutate(weight = gsub("WT_", "", weight)) %>%
  pull(weight) -> chem_names_sig

logreg_table_adj_sm2 %>%
  pull(weight) -> weight_names_sig

logreg_table_adj_sm2 %>%
  pull(variable) -> var_names_sig


logreg_table_adj_sm2 %>%
  mutate(chemical = gsub("WT_", "", weight)) %>%
  left_join(mw %>% select(chemical, commonname)) %>%
  pull(commonname) -> common_names_sig

modellist <- vector(mode = "list", length = length(chem_names_sig))
valuelist <- vector(mode = "list", length = length(chem_names_sig))
plotlist  <- vector(mode = "list", length = length(chem_names_sig))

for(i in seq(1:length(chem_names_sig))) {
  demo_dat %>%
    dplyr::select(!starts_with("WT_")) %>%
    left_join(chem_vars_for_analysis %>%
                filter(chemical==chem_names_sig[i]) %>%
                select(-chemical)) -> temp_dat1
  demo_dat %>%
    dplyr::select(weight_names_sig[i]) -> temp_dat2
  colnames(temp_dat2) <- c("WT")
  cbind(temp_dat1, temp_dat2) %>% filter(!is.na(WT)) %>%
    mutate(nomiss.lb.adj = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(indiv_chem_bioactiv) & !is.na(BMXBMI) & !is.na(RIDAGEYR) &
             !is.na(SDDSRVYR) & !is.na(RIAGENDR) & !is.na(edu) &
             !is.na(RIDRETH1) & RIDAGEYR >= min_age & WT > 0,
           nomiss.ur.adj = !is.na(farmwork) & !is.na(DMDCITZN) &
             !is.na(indiv_chem_bioactiv) & !is.na(BMXBMI) & !is.na(RIDAGEYR) &
             !is.na(SDDSRVYR) & !is.na(RIAGENDR) & !is.na(edu) &
             !is.na(RIDRETH1) & !is.na(URXUCR_umol) & RIDAGEYR >= min_age &
             WT > 0) -> temp_dat
  
  svydesign(strata  = temp_dat$SDMVSTRA, id   = temp_dat$SDMVPSU,
            weights = temp_dat$WT,       data = temp_dat, nest=T) -> nhanes.svy
  
  if(grepl('^LB', chem_names_sig[i])){
    subset(nhanes.svy, nomiss.lb.adj) -> svy.nomiss.adj
    
    temp_dat %>% filter(nomiss.lb.adj) -> no.na.data2
  } else {
    subset(nhanes.svy, nomiss.ur.adj) -> svy.nomiss.adj
    
    temp_dat %>% filter(nomiss.ur.adj) -> no.na.data2
  }
  
  #model included
  if(grepl('^LB', chem_names_sig[i])){
    svyglm(indiv_chem_bioactiv ~ farmwork + DMDCITZN + BMXBMI +
             RIDAGEYR + SDDSRVYR + RIAGENDR + edu + RIDRETH1,
           design = svy.nomiss.adj, family = quasibinomial(link = "logit")) -> fw_adj
  } else {
    svyglm(indiv_chem_bioactiv ~ farmwork + DMDCITZN + BMXBMI +
             RIDAGEYR + SDDSRVYR + RIAGENDR + edu + RIDRETH1 + log(URXUCR_umol),
           design = svy.nomiss.adj, family = quasibinomial(link = "logit")) -> fw_adj
  }
  
  modellist[[i]] <- fw_adj
  
  #other values needed
  if(grepl('^LB', chem_names_sig[i])){
    BMXBMI_avg   <- svymean(x = ~ BMXBMI, design = svy.nomiss.adj)[1]
    SDDSRVYR_med <- svyquantile(~as.numeric(SDDSRVYR),svy.nomiss.adj,
                              c(0.5))$`as.numeric(SDDSRVYR)`[2]
    URXUCR_avg   <- NA_real_
    RIDAGEYR_min <- min(no.na.data2$RIDAGEYR)
    RIDAGEYR_max <- max(no.na.data2$RIDAGEYR)
  } else {
    BMXBMI_avg   <- svymean(x = ~ BMXBMI, design = svy.nomiss.adj)[1]
    SDDSRVYR_med <- svyquantile(~as.numeric(SDDSRVYR),svy.nomiss.adj,
                              c(0.5))$`as.numeric(SDDSRVYR)`[2]
    URXUCR_avg   <- svymean(x = ~ log(URXUCR_umol), design = svy.nomiss.adj)[1]
    RIDAGEYR_min <- min(no.na.data2$RIDAGEYR)
    RIDAGEYR_max <- max(no.na.data2$RIDAGEYR)
  }
  
  data.frame(chem         = c(chem_names_sig[i]),
             commonname   = common_names_sig[i],
             BMXBMI       = BMXBMI_avg,
             SDDSRVYR     = SDDSRVYR_med,
             URXUCR       = URXUCR_avg,
             RIDAGEYR_min = RIDAGEYR_min,
             RIDAGEYR_max = RIDAGEYR_max) ->data_rows
  rownames(data_rows)<-NULL
  
  valuelist[[i]] <- data_rows
  #print(i)
}

do.call(rbind, valuelist) -> logreg_var_values

# create plots of estimated probability of bioactivity
for(i in seq(1:length(chem_names_sig))) {
  RIDAGEYR=seq(logreg_var_values$RIDAGEYR_min[i],
               logreg_var_values$RIDAGEYR_max[i],length=50)
  
  if(grepl('^LB', chem_names_sig[i])){
    expand.grid(farmwork = c(0,1), DMDCITZN = c(0,1), BMXBMI = logreg_var_values$BMXBMI[i],
              RIDAGEYR = RIDAGEYR, SDDSRVYR = logreg_var_values$SDDSRVYR[i], RIAGENDR = 1,
              edu = c(3,4), RIDRETH1 = c(1,3,4)) -> temp_data
  } else {
    expand.grid(farmwork = c(0,1), DMDCITZN = c(0,1), BMXBMI = logreg_var_values$BMXBMI[i],
              RIDAGEYR = RIDAGEYR, SDDSRVYR = logreg_var_values$SDDSRVYR[i], RIAGENDR = 1,
              edu = c(3,4), RIDRETH1 = c(1,3,4),
              URXUCR_umol = exp(logreg_var_values$URXUCR[i])) -> temp_data
  }
  
  temp_data %>%
    mutate(farmwork = as.factor(farmwork),
           DMDCITZN = as.factor(DMDCITZN),
           SDDSRVYR = as.factor(SDDSRVYR),
           RIAGENDR = as.factor(RIAGENDR),
           edu      = as.factor(edu),
           RIDRETH1 = as.factor(RIDRETH1)
    ) -> test_data
  
  predict(modellist[[i]],newdata=test_data) %>%
    as.data.frame() -> pred_data
  
  cbind(test_data, pred_data) %>%
    mutate(est_prob = exp(link)/(1+exp(link)),
           education = case_when(edu == 3 ~ "High School\nGrad / GED",
                                 edu == 4 ~ "Some College or\nAA degree",
                                 TRUE     ~ NA_character_),
           race = case_when(RIDRETH1 == 3 ~ "Non-Hispanic White",
                            RIDRETH1 == 4 ~ "Non-Hispanic Black",
                            RIDRETH1 == 1 ~ "Mexican American",
                            TRUE          ~ NA_character_),
           farmworker = case_when(farmwork == 1 ~ "Farmworker",
                                  farmwork == 0 ~ "Non-Farmworker",
                                  TRUE          ~ NA_character_),
           citizen = case_when(DMDCITZN == 1 ~ "Non-US Citizen",
                               DMDCITZN == 0 ~ "US Citizen",
                               TRUE          ~ NA_character_),
           farm_citiz = case_when(DMDCITZN == 1 & farmwork == 1 ~ "Non-Citizen, Farmworker",
                                  DMDCITZN == 1 & farmwork == 0 ~ "Non-Citizen, Non-Farmworker",
                                  DMDCITZN == 0 & farmwork == 1 ~ "US-Citizen, Farmworker",
                                  DMDCITZN == 0 & farmwork == 0 ~ "US-Citizen, Non-Farmworker",
                                  TRUE ~ NA_character_)) -> test_plot
  
  
  test_plot$farmworker <- factor(test_plot$farmworker, levels = c("Farmworker",
                                                                  "Non-Farmworker"))
  test_plot$citizen <- factor(test_plot$citizen, levels = c("Non-US Citizen",
                                                            "US Citizen"))
  test_plot$education <- factor(test_plot$education, levels = c("High School\nGrad / GED",
                                                                "Some College or\nAA degree"))
  test_plot$race <- factor(test_plot$race, levels = c("Non-Hispanic White",
                                                      "Mexican American",
                                                      "Non-Hispanic Black"))
  test_plot$farm_citiz <- factor(test_plot$farm_citiz, levels = c("Non-Citizen, Farmworker",
                                                                  "Non-Citizen, Non-Farmworker",
                                                                  "US-Citizen, Farmworker",
                                                                  "US-Citizen, Non-Farmworker"))
  
  test_plot %>%
    #filter(DMDCITZN==1, farmwork==1) %>%
    ggplot(aes(x=RIDAGEYR, y=est_prob, group=farm_citiz, color=farm_citiz)) +
    #ggplot(aes(x=BMXBMI, y=est_prob, group=farm_citiz, color=farm_citiz)) +
    facet_grid(education ~ race)+
    #geom_jitter(data = dat_training, aes(x=como_asthma,y=cCFR),color="gray", size=0.25)+
    geom_line()+
    #geom_line(aes(x = `Asthma Pop. (%)`, y = lower), linetype = 2) + 
    #geom_line(aes(x = `Asthma Pop. (%)`, y = upper), linetype = 2) +
    theme_bw() + theme(panel.grid.major = element_blank(),
                       panel.grid.minor = element_blank(), 
                       legend.position = "bottom",
                       legend.title= element_blank(),
                       axis.title = element_text(size=12),
                       axis.text = element_text(size=10),
                       text = element_text(size = 10),
                       legend.text = element_text(size=8)
    )+
    guides(color=guide_legend(nrow=2, byrow=TRUE)) +
    ggtitle(logreg_var_values$commonname[i]) +
    xlab("Age") +
    ylab("Est. Probability of Bioactivity") -> plotlist[[i]]
}

plotlist[[1]]

#ggsave("suppl_fig1.pdf", width=6.5, height=4, units="in", device=cairo_pdf)

plotlist[[2]]

#ggsave("suppl_fig2.pdf", width=6.5, height=4, units="in", device=cairo_pdf)

plotlist[[3]]

#ggsave("suppl_fig3.pdf", width=6.5, height=4, units="in", device=cairo_pdf)

plotlist[[4]]

#ggsave("suppl_fig4.pdf", width=6.5, height=4, units="in", device=cairo_pdf)

plotlist[[5]]

#ggsave("suppl_fig5.pdf", width=6.5, height=4, units="in", device=cairo_pdf)

plotlist[[6]]

#ggsave("suppl_fig6.pdf", width=6.5, height=4, units="in", device=cairo_pdf)

plotlist[[7]]

#ggsave("suppl_fig7.pdf", width=6.5, height=4, units="in", device=cairo_pdf)

Main manuscript figures

Volcano Plot (Log Reg adjusted results)

logreg_results_adj %>%
  mutate(p.fdr = p.adjust(p, method="fdr"),
         p.fdr.sig = ifelse(p.fdr<0.05,"*","")) %>%
  filter(!is.na(OR)) %>%
  filter(str_detect(weight, "_[ABC]_", negate=T)) %>%
  left_join(mw %>% select(chem=chemical, commonname)) %>%
  mutate(label=
           case_when(p.fdr < 0.05 ~ commonname,
                     OR> 1.5 ~ commonname,
                     TRUE ~ NA_character_),
         variable = as.factor(variable),
         color = as.factor(ifelse(p.fdr < 0.05, "p < 0.05", "p \u2265 0.05"))) -> vol_plot_data

#Reverse log scale for y axis
reverselog_trans <- function(base = exp(1)) {
  trans <- function(x) -log(x, base)
  inv <- function(x) base^(-x)
  trans_new(paste0("reverselog-", format(base)), trans, inv,
            log_breaks(base = base),
            domain = c(1e-100, Inf))
}

#Breaks for FDR legend
my_breaks = c(1e-8, 1e-5, 5e-2, 1)

#Italicized y axis title
my_y_title <- expression(paste(italic("p"), "-value (FDR)"))

vol_plot_data %>%
  ggplot(aes(x=OR, y=p.fdr, shape=variable, fill=color, label=label), size=1) +
  geom_vline(xintercept = 1, linetype="dotted") +
  #geom_hline(yintercept = 0.05, linetype="dotted") +
  geom_point(size=2) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        #legend.position="bottom", legend.box="vertical", #legend.margin=margin(),
        #legend.position=c(2,0.005), legend.box="horizontal", 
        legend.title = element_text(size=12),
        axis.title = element_text(size=12),
        axis.text = element_text(size=10),
        text = element_text(size = 10),
        legend.margin=margin(t = 0.2, unit='cm'),
        legend.text = element_text(size=10)) +
  scale_shape_manual(values = c(21,24)) +
  scale_fill_manual(name="Significance", values=c('black','gray75'),
                     labels = c(expression(paste(italic("p"), " < 0.05")),
                                expression(paste(italic("p"), " \u2265 0.05"))),
    guide=guide_legend(override.aes =
                         list(color=c('black','gray75'),
                              #shape = c(21,23),
                              order=1))) +
  # scale_color_gradient(name="FDR", trans="log", #values=c('black','gray90'),
  #                      low = "#56B1F7", high = "#132B43",
  #                      breaks = my_breaks, labels = my_breaks) +
  labs(shape="Category") +
  scale_x_continuous(breaks=seq(-1,11,by=2),limits=c(-1, 12)) +
  scale_y_continuous(trans=reverselog_trans(10), limits=c(1, 1e-10),
                     breaks=c(1e-01,1e-04,1e-07,1e-10)) +
  xlab("Odds Ratio") +
  ylab(my_y_title)+
  ggrepel::geom_label_repel(aes(x=OR, y=p.fdr, label = label,
                       #fill=factor(color),
                       #colour="white"
                       ),
                       min.segment.length = 0, seed=42,
                           nudge_x = 1,
                           box.padding = 0.5,
                       force=2,
                       force_pull=0,
                       nudge_y = ifelse(vol_plot_data$label == "Beta-hexachlorocyclohexane", 3, 0),
                   show.legend = FALSE,
                   direction = "both",
                   inherit.aes = FALSE)

#Save figure for publication
#ggsave("vol_plot.pdf", width=6.5, height=3.5, units="in", device=cairo_pdf)

Boxplot Plot (NHANES and Model ACC)

full_chem_demo_dat %>%
  select(casn, molarity) %>%
  #filter(!is.na(molarity)) %>%
  group_by(casn) %>%
  left_join(toxcast %>% select(casn, chnm) %>% unique()) %>%
  ungroup() %>%
  mutate(group="NHANES") -> df_temp

full_chem_demo_dat %>%
  filter(farmwork==1) %>%
  select(casn, molarity) %>%
  #filter(!is.na(molarity)) %>%
  group_by(casn) %>%
  left_join(toxcast %>% select(casn, chnm) %>% unique()) %>%
  ungroup() %>%
  mutate(group="Farmwork") -> df_temp_fw

full_chem_demo_dat %>%
  filter(DMDCITZN==1) %>%
  select(casn, molarity) %>%
  #filter(!is.na(molarity)) %>%
  group_by(casn) %>%
  left_join(toxcast %>% select(casn, chnm) %>% unique()) %>%
  ungroup() %>%
  mutate(group="Noncitizen") -> df_temp_nc

toxcast %>%
  filter(hitc==1) %>%
  select(casn, molarity=modl_acc, chnm) %>%
  filter(!is.na(molarity)) %>%
  mutate(group="Model ACC") %>%
  rbind(df_temp) %>%
  rbind(df_temp_fw) %>%
  rbind(df_temp_nc) %>%
  mutate(chnm = as.factor(chnm),
         group = as.factor(group)) %>%
  filter(!chnm %in% c("Chlorpyrifos oxon", "Chlorpyrifos")) %>%
  mutate(persistent = ifelse(casn %in% c("50-29-3", "60-57-1",
                                         "76-44-8", "72-55-9",
                                         "319-85-7"),
                             "Serum Biomarkers", "Urinary Biomarkers")) -> box_plot_data

box_plot_data$group <- factor(box_plot_data$group,
                              levels = c("Model ACC", "Farmwork", "Noncitizen", "NHANES"))
box_plot_data$persistent <- factor(box_plot_data$persistent,
                              levels = c("Serum Biomarkers",
                                         "Farmwork", "Urinary Biomarkers"))

box_plot_data %>%
  select(chnm) %>%
  distinct() %>%
  mutate(chnm_wrap =
           case_when(
             chnm == "3-Phenoxybenzoic acid"           ~ "3-Phenoxybenzoic\nAcid",
             chnm == "Dichlorodiphenyltrichloroethane" ~ "p,p'-DDT",
             chnm == "beta-Hexachlorocyclohexane"      ~ "Beta-hexachloro\ncyclohexane",
             chnm == "2,4-Dichlorophenol"              ~ "2,4-Dichloro\nphenol",
             chnm == "2,4-Dichlorophenoxyacetic acid"  ~ "2,4-Dichlorophen\noxyacetic Acid",
             chnm == "3,5,6-Trichloro-2-pyridinol"     ~ "3,5,6-Trichloro-\n2-pyridinol",
             chnm == "2,5-Dichlorophenol"              ~ "2,5-Dichloro\nphenol",
             chnm == "p,p'-DDE"              ~ "p,p'-DDE",
             chnm == "Heptachlor"              ~ "Heptachlor\nEpoxide",
             chnm == "Dieldrin"              ~ "Dieldrin",
             chnm == "4-Nitrophenol"              ~ "4-Nitrophenol",
             chnm == "DEET"              ~ "DEET Acid",
             TRUE ~ NA_character_
           ))-> chnm_wrap_names

#Order boxplots
box_plot_data %>%
  group_by(chnm, group) %>%
  summarise(med_mol = median(molarity,na.rm=TRUE)) %>%
  ungroup() -> group_medians

box_plot_data %>%
  filter(group %in% c("Model ACC", "NHANES")) %>%
  left_join(group_medians %>% filter(group=="NHANES") %>% select(-group)) %>%
  #filter(group %in% c("Model ACC", "Noncitizen")) %>%
  #left_join(group_medians %>% filter(group=="Noncitizen") %>% select(-group)) %>%
  #filter(group %in% c("Model ACC", "Farmwork")) %>%
  #left_join(group_medians %>% filter(group=="Farmwork") %>% select(-group)) %>%
  left_join(chnm_wrap_names) %>%
  ggplot(aes(x = reorder(chnm_wrap, med_mol),
             y=molarity, fill=group))+
  geom_boxplot(outlier.size=0.25)+
  scale_fill_manual(values=c('gray85','white'))+
  scale_y_continuous(trans="log10")+
  #scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) +
  #facet_wrap(vars(persistent), scales = "free_y", nrow=2)+
  #facet_grid(persistent~., scales = "free_y", space='free_y')+
  ggforce::facet_col(vars(persistent), space="free", scale="free_y") +
  theme_bw() +
  theme(
    axis.text.x= element_text(angle=90,hjust=0.95,vjust=0.5),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom",
        #axis.text.x = element_text(angle = 45, vjust = 1, hjust=1),
        legend.title = element_blank(),
        axis.title = element_text(size=12),
        axis.text = element_text(size=10),
        text = element_text(size = 10),
        legend.text = element_text(size=10)) +
  ylab("Molarity (\u03bcM)") +
  xlab("Chemical") +
  coord_flip()

#ggsave("box_plot.pdf", width=3.5, height=7, units="in", device=cairo_pdf)

Non-parametric demographic data analysis

Look at demographics.

bio.demo <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA,
                      family = ~quasibinomial, weights = ~WTMEC16YR,
                      nest = TRUE, data = full_chem_demo_dat)

svyranktest(BMXBMI   ~ indiv_chem_bioactiv, bio.demo, test = c("wilcoxon"))
## 
##  Design-based KruskalWallis test
## 
## data:  BMXBMI ~ indiv_chem_bioactiv
## t = 3.5503, df = 122, p-value = 0.0005474
## alternative hypothesis: true difference in mean rank score is not equal to 0
## sample estimates:
## difference in mean rank score 
##                    0.01112611
svyranktest(RIDAGEYR ~ indiv_chem_bioactiv, bio.demo, test = c("wilcoxon"))
## 
##  Design-based KruskalWallis test
## 
## data:  RIDAGEYR ~ indiv_chem_bioactiv
## t = 8.4893, df = 122, p-value = 5.887e-14
## alternative hypothesis: true difference in mean rank score is not equal to 0
## sample estimates:
## difference in mean rank score 
##                    0.02894323
svyranktest(INDFMPIR ~ indiv_chem_bioactiv, bio.demo, test = c("wilcoxon"))
## 
##  Design-based KruskalWallis test
## 
## data:  INDFMPIR ~ indiv_chem_bioactiv
## t = -6.1025, df = 122, p-value = 1.274e-08
## alternative hypothesis: true difference in mean rank score is not equal to 0
## sample estimates:
## difference in mean rank score 
##                   -0.02346471
svychisq( ~ SDDSRVYR + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~SDDSRVYR + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## X-squared = 1879.8, df = 7, p-value < 2.2e-16
svychisq( ~ RIAGENDR + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~RIAGENDR + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## X-squared = 44.829, df = 1, p-value = 4.91e-07
svychisq( ~ RIDRETH1 + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~RIDRETH1 + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## X-squared = 326.95, df = 4, p-value < 2.2e-16
svychisq( ~ farmwork + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~farmwork + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## X-squared = 45.543, df = 1, p-value = 2.92e-06
#svychisq( ~ citzn    + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
svychisq( ~ DMDBORN  + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~DMDBORN + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## X-squared = 483.79, df = 3, p-value < 2.2e-16
svychisq( ~ edu      + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## 
##  Pearson's X^2: Rao & Scott adjustment
## 
## data:  svychisq(~edu + indiv_chem_bioactiv, bio.demo, statistic = c("Chisq"))
## X-squared = 64.578, df = 4, p-value = 1.244e-07
full_chem_demo_dat %>%
  dplyr::select(SEQN,bioactiv_count) %>%
  arrange(SEQN,bioactiv_count) %>%
  distinct() %>%
  ggplot(aes(x=bioactiv_count)) + 
  geom_histogram(binwidth=1) + xlab("Number of bioactive chemicals within single subject")

demo_dat %>%
  dplyr::select(!starts_with("WT_")) %>%
  left_join(chem_vars_for_analysis %>%
              select(SEQN, indiv_chem_bioactiv) %>%
              filter(!is.na(indiv_chem_bioactiv)) %>%
              group_by(SEQN) %>%
              summarise(indiv_chem_bioactiv = max(indiv_chem_bioactiv)) %>%
              ungroup()) %>%
  left_join(chem_vars_for_analysis %>%
              select(SEQN, URXUCR_umol) %>%
              filter(!is.na(URXUCR_umol)) %>%
              distinct()) %>%
  mutate(nomiss.adj.fw = !is.na(farmwork) & !is.na(indiv_chem_bioactiv) &
             !is.na(BMXBMI) & !is.na(RIDAGEYR) & !is.na(SDDSRVYR) &
             !is.na(RIAGENDR) & !is.na(edu) & !is.na(RIDRETH1) &
             !is.na(URXUCR_umol) & RIDAGEYR >= min_age & WTMEC16YR > 0,
         nomiss.adj.cz = !is.na(DMDCITZN) & !is.na(indiv_chem_bioactiv) &
             !is.na(BMXBMI) & !is.na(RIDAGEYR) & !is.na(SDDSRVYR) &
             !is.na(RIAGENDR) & !is.na(edu) & !is.na(RIDRETH1) &
             !is.na(URXUCR_umol) & RIDAGEYR >= min_age & WTMEC16YR > 0) ->temp_dat

nhanes.svy <- svydesign(id = ~SDMVPSU, strata = ~SDMVSTRA,
                      family = ~quasibinomial, weights = ~WTMEC16YR,
                      nest = TRUE, data = temp_dat)

subset(nhanes.svy, nomiss.adj.fw) -> svy.nomiss.adj.fw
temp_dat %>% filter(nomiss.adj.fw) -> no.na.data.fw

subset(nhanes.svy, nomiss.adj.cz) -> svy.nomiss.adj.cz
temp_dat %>% filter(nomiss.adj.cz) -> no.na.data.cz

svyglm(indiv_chem_bioactiv ~ farmwork + BMXBMI + RIDAGEYR + factor(SDDSRVYR) +
                 factor(RIAGENDR) + factor(edu) + factor(RIDRETH1) + log(URXUCR_umol),
               design = svy.nomiss.adj.fw, family = quasibinomial(link = "logit")) -> fw_adj

print("Farmworkers were 1.15 times more likely to have a bioactive pesticide biomarker measurement in comparison to non-farmworkers.")
## [1] "Farmworkers were 1.15 times more likely to have a bioactive pesticide biomarker measurement in comparison to non-farmworkers."
c(OR=exp(coefficients(fw_adj)[2]),
  CI_2.5   = exp(confint(fw_adj, ddf=degf(fw_adj$survey.design)))[2,1],
  CI_97.5  = exp(confint(fw_adj, ddf=degf(fw_adj$survey.design)))[2,2],
  t        = summary(fw_adj, df.resid = degf(fw_adj$survey.design))$coef[2,3],
  p        = summary(fw_adj)$coefficients[2,4]
  )
## OR.farmwork1       CI_2.5      CI_97.5            t            p 
##    1.1480739    0.8745447    1.5071543    1.0043897    0.3175446
svyglm(indiv_chem_bioactiv ~ DMDCITZN + BMXBMI + RIDAGEYR + factor(SDDSRVYR) +
                 factor(RIAGENDR) + factor(edu) + factor(RIDRETH1) + log(URXUCR_umol),
               design = svy.nomiss.adj.cz, family = quasibinomial(link = "logit")) -> cz_adj

print("Non-U.S. citizens were 1.39 times more likely to have bioactive pesticide biomarker concentrations compared those with U.S. citizenship.")
## [1] "Non-U.S. citizens were 1.39 times more likely to have bioactive pesticide biomarker concentrations compared those with U.S. citizenship."
c(OR=exp(coefficients(cz_adj)[2]),
  CI_2.5   = exp(confint(cz_adj, ddf=degf(cz_adj$survey.design)))[2,1],
  CI_97.5  = exp(confint(cz_adj, ddf=degf(cz_adj$survey.design)))[2,2],
  t        = summary(cz_adj, df.resid = degf(cz_adj$survey.design))$coef[2,3],
  p        = summary(cz_adj)$coefficients[2,4]
  )
## OR.DMDCITZN1       CI_2.5      CI_97.5            t            p 
## 1.3877865551 1.1709518827 1.6447742653 3.8181778724 0.0002298857